In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd "/gdrive/MyDrive/captcha"

Mounted at /gdrive
/gdrive/MyDrive/captcha


In [2]:
%ls

 captcha.ipynb   kaggle.json   model_weights.hdf5  [0m[01;34m'persian_digits '[0m/   [01;34mpics[0m/


In [3]:
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

# !kaggle datasets download -d aliassareh1/persian-digits-captcha
# %ls

Downloading persian-digits-captcha.zip to /gdrive/MyDrive/captcha
 99% 249M/252M [00:08<00:00, 29.5MB/s]
100% 252M/252M [00:08<00:00, 31.0MB/s]
 captcha.ipynb   model_weights.hdf5   persian-digits-captcha.zip
 kaggle.json    [0m[01;34m'persian_digits '[0m/    [01;34mpics[0m/


In [None]:
# ! unzip 'persian-digits-captcha.zip' -d 'persian_digits '

In [7]:
import os
from PIL import Image, ImageOps
import numpy as np
from keras_preprocessing.sequence import pad_sequences

from keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional
from keras.models import Model
import keras.backend as K
from keras.callbacks import ModelCheckpoint

In [10]:
filenames = []
for dirname, _, files in os.walk('./persian_digits '):
  for f in files:
    filenames = np.append(filenames, f)
        
num_samples = len(filenames)
print('number of samples: ', num_samples)

number of samples:  30000


In [11]:
# list of our characters
char_list = "0123456789"

In [12]:
def encode_to_labels(txt):
    # encoding each label into list of digits
    encoded_list = []
    for char in txt:
        encoded_list.append(char_list.index(char))
    
    return encoded_list

In [40]:
path = '/gdrive/MyDrive/captcha/persian_digits /dataset/'

# lists for training dataset
training_img = []                    # the images for training the model
training_txt = []                    # the labels
train_input_length = []              # the input of LSTM part of the model
train_label_length = []              # the label's length (4 to 7)
train_orig_txt = []
 
#lists for validation dataset
valid_img = []
valid_txt = []
valid_input_length = []
valid_label_length = []
valid_orig_txt = []
 
max_label_len = 0                    # max length for our labels (in this case 7)

In [41]:
for file in filenames:
    raw = Image.open(path + file)
    gray = ImageOps.grayscale(raw)
    img = np.array(gray)
    img = np.expand_dims(img , axis = 2)
    img = img/255.

    txt = file.split('.')[0]

    if len(txt) > max_label_len:
        max_label_len = len(txt)
    
    # split the dataset (85% train, 15% test)
    if np.random.rand() >= 0.85:
        valid_orig_txt.append(txt)
        valid_label_length.append(len(txt))
        valid_input_length.append(75)
        valid_img.append(img)
        valid_txt.append(encode_to_labels(txt))
    else:
        train_orig_txt.append(txt)
        train_label_length.append(len(txt))
        train_input_length.append(75)
        training_img.append(img)
        training_txt.append(encode_to_labels(txt))

In [21]:
print('number of training images: ', len(training_img))
print('number of validation images: ', len(valid_img))

number of training images:  8486
number of validation images:  1514


In [22]:
print(max_label_len)

7


In [23]:
# pad each output label to maximum text length
 
train_padded_txt = pad_sequences(training_txt, maxlen=max_label_len, padding='post', value = len(char_list))
valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value = len(char_list))

In [25]:
import cv2

im = cv2.imread('/gdrive/MyDrive/captcha/persian_digits /dataset/000001.jpg')

print(type(im))
# <class 'numpy.ndarray'>

print(im.shape)
print(type(im.shape))

<class 'numpy.ndarray'>
(64, 306, 3)
<class 'tuple'>


In [26]:
# input with shape of height=64 and width=306
inputs = Input(shape=(64,306,1))
 

conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)
 
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)
 
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4)
 
conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
batch_norm_5 = BatchNormalization()(conv_5)
 
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)

conv_7 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_6)
batch_norm_7 = BatchNormalization()(conv_7)
pool_7 = MaxPool2D(pool_size=(2, 1))(batch_norm_7)

conv_8 = Conv2D(512, (2,2), activation = 'relu')(pool_7)
 
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_8)
 

blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)
 
outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)

# this is the model we use for our prediction(after training)
prediction_model = Model(inputs, outputs)

In [27]:
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
 
 
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
 
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
 
 
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])

In [28]:
# this is the model for training
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)

model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')

In [29]:
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 64, 306, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d (Conv2D)                (None, 64, 306, 64)  640         ['input_1[0][0]']                
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 32, 153, 64)  0           ['conv2d[0][0]']                 
                                                                                                  
 conv2d_1 (Conv2D)              (None, 32, 153, 128  73856       ['max_pooling2d[0][0]']    

In [30]:
training_img = np.array(training_img)
train_input_length = np.array(train_input_length)
train_label_length = np.array(train_label_length)

valid_img = np.array(valid_img)
valid_input_length = np.array(valid_input_length)
valid_label_length = np.array(valid_label_length)

training_txt = np.array(training_txt)
valid_txt = np.array(valid_txt)

batch_size = 32
epochs = 10

  training_txt = np.array(training_txt)
  valid_txt = np.array(valid_txt)


In [31]:
model.fit(x=[training_img, train_padded_txt, train_input_length, train_label_length],
          y=np.zeros(len(training_img)),
          batch_size=batch_size,
          epochs = epochs,
          validation_data = ([valid_img, valid_padded_txt, valid_input_length, valid_label_length], [np.zeros(len(valid_img))]),
          verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6d4851ad00>

In [32]:
model.save_weights('model_weights_V1.hdf5')

In [36]:
# load the model weights
prediction_model.load_weights('model_weights_V1.hdf5')

# predict outputs on validation images
prediction = prediction_model.predict(valid_img[:10])

# use CTC decoder
out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1],
                         greedy=True)[0][0])
 
# see the results
i = 0
for x in out:
    print("original_text =  ", valid_orig_txt[i])
    print("predicted text = ", end = '')
    for p in x:  
        if int(p) != -1:
            print(char_list[int(p)], end = '')       
    print('\n')
    i+=1

original_text =   0000232
predicted text = 0000232

original_text =   000124
predicted text = 000124

original_text =   0003449
predicted text = 0003449

original_text =   00051
predicted text = 00051

original_text =   000975
predicted text = 000975

original_text =   0011345
predicted text = 0011345

original_text =   0014867
predicted text = 0014867

original_text =   00181
predicted text = 00181

original_text =   002008
predicted text = 002008

original_text =   002374
predicted text = 002374



In [37]:
%ls

 captcha.ipynb   model_weights.hdf5     [0m[01;34m'persian_digits '[0m/            [01;34mpics[0m/
 kaggle.json     model_weights_V1.hdf5   persian-digits-captcha.zip
