In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
from tqdm import tqdm

from tensorflow import keras

In [3]:
# Index of the physical GPU this notebook should run on (the third device on
# the original 3-GPU machine).
GPU_INDEX = 2

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to a single GPU. Fall back to the last available
        # device when the machine has fewer than GPU_INDEX + 1 GPUs; indexing
        # gpus[2] directly would raise an IndexError that the RuntimeError
        # handler below does not catch.
        visible_gpu = gpus[min(GPU_INDEX, len(gpus) - 1)]
        tf.config.experimental.set_visible_devices(visible_gpu, 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Visible devices and memory growth must be set before GPUs have been initialized
        print(e)

3 Physical GPUs, 1 Logical GPUs


In [4]:
# Configuration shared by the cells below.
# Number of examples per batch in the train / validation / test pipelines.
BATCH_SIZE = 40
# Buffer size passed to Dataset.shuffle().
SHUFFLE_BUFFER_SIZE = 5000
# Output size of the CNN_Encoder dense layer and of the decoder's character embedding.
embedding_dim = 256
# Number of GRU / attention units in the decoder.
units = 512
# Target (height, width) every image is resized to by tf.image.resize.
IMAGE_HEIGHT = 300
IMAGE_WIDTH = 160
# Learning rate of the Adam optimizer.
LEARNING_RATE = 5e-5
# Directory holding the '<name>.png' images, and the "<name> <word>" annotation file.
IMAGE_DIR = './words_captcha/'
annotation_file = './words_captcha/spec_train_val.txt'

### Read the annotation txt file and split it into training and validation sets

In [5]:
# Number of leading annotation entries used for training; the remaining
# entries form the validation set.
TRAIN_SIZE = 100000

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(annotation_file, 'r') as f:
    lines = f.readlines()

train_img_name = []
val_img_name = []
train_annotation = []
val_annotation = []
num = 0

# Each entry is "<image name> <word>". split() without arguments also strips
# the trailing newline (including '\r' from CRLF files) and tolerates repeated
# spaces.
for line in lines:
    fields = line.split()
    if len(fields) < 2:
        # Blank or malformed line (e.g. a trailing empty line): skip it
        # instead of failing with an IndexError.
        continue
    img_name, word = fields[0], fields[1]
    if num < TRAIN_SIZE:
        train_img_name.append(img_name)
        train_annotation.append(word)
    else:
        val_img_name.append(img_name)
        val_annotation.append(word)
    num += 1

In [6]:
# Sanity check: split sizes, then the first (image name, word) pair of each split.
print(len(train_img_name), len(val_img_name))
for names, words in ((train_img_name, train_annotation), (val_img_name, val_annotation)):
    print(names[0], words[0])

100000 20000
a0 thus
a100000 cio


### Create the character_to_idx and idx_to_character dictionaries, and map each original word to a list of indices

In [7]:
def max_length(annotations):
    """Return the length of the longest annotation.

    Args:
        annotations: iterable of sized objects (here: the annotation strings).

    Returns:
        The largest ``len(annotation)``, or 0 when ``annotations`` is empty.
    """
    return max((len(annotation) for annotation in annotations), default=0)

In [8]:
# Index 0 is reserved for the padding token; every distinct character found in
# the training annotations then receives the next free index, in order of
# first appearance.
character_to_idx = {'<pad>': 0}
idx_to_character = {0: '<pad>'}
index = 1

for annotation in train_annotation:
    for character in annotation:
        if character in character_to_idx:
            continue
        character_to_idx[character] = index
        idx_to_character[index] = character
        index += 1

In [9]:
# Inspect the character -> index mapping built from the training annotations.
character_to_idx

{'<pad>': 0,
 't': 1,
 'h': 2,
 'u': 3,
 's': 4,
 'w': 5,
 'i': 6,
 'e': 7,
 'd': 8,
 'j': 9,
 'a': 10,
 'm': 11,
 'z': 12,
 'o': 13,
 'p': 14,
 'l': 15,
 'b': 16,
 'g': 17,
 'v': 18,
 'k': 19,
 'y': 20,
 'n': 21,
 'r': 22,
 'c': 23,
 'q': 24,
 'f': 25,
 'x': 26}

In [10]:
# Register the sequence delimiters in both lookup tables. The ids 27 and 28
# follow the character ids 1-26 listed in the output above.
for token, token_idx in (('<start>', 27), ('<end>', 28)):
    character_to_idx[token] = token_idx
    idx_to_character[token_idx] = token

In [11]:
# Inspect the mapping again, now including the <start> and <end> tokens.
character_to_idx

{'<pad>': 0,
 't': 1,
 'h': 2,
 'u': 3,
 's': 4,
 'w': 5,
 'i': 6,
 'e': 7,
 'd': 8,
 'j': 9,
 'a': 10,
 'm': 11,
 'z': 12,
 'o': 13,
 'p': 14,
 'l': 15,
 'b': 16,
 'g': 17,
 'v': 18,
 'k': 19,
 'y': 20,
 'n': 21,
 'r': 22,
 'c': 23,
 'q': 24,
 'f': 25,
 'x': 26,
 '<start>': 27,
 '<end>': 28}

In [12]:
# Find out the max_length
# The +2 leaves room for the <start> and <end> ids that the encoding cell puts
# around every word. Only the training annotations are measured here.
max_len = max_length(train_annotation) + 2
max_len

7

In [13]:
def encode_annotation(annotation, length):
    """Convert one word into a fixed-length list of vocabulary indices.

    The result is ``<start>``, one index per character, ``<end>``, followed by
    ``<pad>`` indices until the list holds ``length`` entries. Nothing is
    truncated: a word longer than ``length - 2`` yields a longer list.

    Args:
        annotation: the word to encode, as a string of known characters.
        length: minimum length of the returned list (padding target).

    Returns:
        List of int indices taken from ``character_to_idx``.

    Raises:
        KeyError: if ``annotation`` contains a character that is not in
            ``character_to_idx``.
    """
    encoded = [character_to_idx['<start>']]
    encoded.extend(character_to_idx[character] for character in annotation)
    encoded.append(character_to_idx['<end>'])
    # A non-positive repeat count yields an empty list, i.e. no padding.
    encoded.extend([character_to_idx['<pad>']] * (length - len(encoded)))
    return encoded


# Both splits share the same encoding and the same padded length.
train_annotation_idx = [encode_annotation(annotation, max_len) for annotation in train_annotation]
val_annotation_idx = [encode_annotation(annotation, max_len) for annotation in val_annotation]

In [14]:
# Peek at the first five encoded training words (<start>, characters, <end>, padding).
train_annotation_idx[:5]

[[27, 1, 2, 3, 4, 28, 0],
 [27, 5, 5, 5, 28, 0, 0],
 [27, 1, 6, 7, 8, 28, 0],
 [27, 6, 8, 4, 28, 0, 0],
 [27, 9, 10, 11, 28, 0, 0]]

In [15]:
# Peek at the first five encoded validation words.
val_annotation_idx[:5]

[[27, 23, 6, 13, 28, 0, 0],
 [27, 17, 11, 1, 28, 0, 0],
 [27, 15, 6, 18, 7, 4, 28],
 [27, 4, 2, 13, 5, 21, 28],
 [27, 4, 19, 20, 28, 0, 0]]

### Create the train dataset and validation dataset

In [16]:
def load_image(image_name, annotation):
    """Read one image from disk and preprocess it for the model.

    Args:
        image_name: image name without directory or extension; the file read
            is ``IMAGE_DIR + image_name + '.png'``.
        annotation: label passed through unchanged so the function can be used
            with ``Dataset.map`` on (name, label) pairs.

    Returns:
        Tuple ``(img, annotation)`` where ``img`` is a 3-channel float tensor
        resized to ``(IMAGE_HEIGHT, IMAGE_WIDTH)``.
    """
    img = tf.io.read_file(IMAGE_DIR + image_name + '.png')
    # The files are PNGs, so use the PNG decoder rather than decode_jpeg.
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, (IMAGE_HEIGHT, IMAGE_WIDTH))
    # NOTE(review): this maps pixel values from [0, 255] to [-1, 0], not to
    # [-1, 1]. It is left unchanged so that it stays consistent with
    # load_test_image and with checkpoints trained on this scaling.
    img = img/255 - 1.
    return img, annotation

In [17]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_img_name,train_annotation_idx))
val_dataset = tf.data.Dataset.from_tensor_slices((val_img_name,val_annotation_idx))

# Shuffle the lightweight (name, label) pairs *before* decoding, so the shuffle
# buffer holds SHUFFLE_BUFFER_SIZE short strings instead of that many decoded
# float images.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE)
train_dataset = train_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.batch(BATCH_SIZE)
# Let tf.data choose the prefetch depth instead of buffering 200 whole batches.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Validation metrics do not depend on the example order, so no shuffling here.
val_dataset = val_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [18]:
# Display the dataset's element shapes and dtypes.
train_dataset

<PrefetchDataset shapes: ((None, 300, 160, 3), (None, 7)), types: (tf.float32, tf.int32)>

### Construct the model

In [19]:
# Vocabulary size: every entry of character_to_idx, special tokens included.
vocab_size = len(character_to_idx)
# Number of full batches per epoch (floor division) for each split.
num_steps = len(train_img_name) // BATCH_SIZE
val_num_steps = len(val_img_name) // BATCH_SIZE

In [20]:
class BahdanauAttention(tf.keras.Model):
    """Additive (Bahdanau-style) attention over a sequence of feature vectors.

    Args:
        units: width of the two dense projections applied to the features and
            to the decoder hidden state before they are summed.
    """

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Compute the attention-weighted summary of ``features``.

        Args:
            features: encoder output; assumed to be
                (batch_size, num_locations, feature_dim) -- confirm against caller.
            hidden: decoder state; assumed to be (batch_size, hidden_size).

        Returns:
            context_vector: ``features`` summed over axis 1 after weighting,
                i.e. (batch_size, feature_dim).
            attention_weights: softmax over axis 1 of the scalar scores,
                i.e. (batch_size, num_locations, 1).
        """
        # Insert a length-1 axis so the state broadcasts against every location.
        query = tf.expand_dims(hidden, 1)

        # Sum of both projections: (batch_size, num_locations, units).
        projected = self.W1(features) + self.W2(query)

        # One scalar score per location, normalised across the locations.
        logits = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(logits, axis=1)

        # Weighted sum of the feature vectors over the location axis.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

### Design the feature extractor

In [21]:
class conv_relu(tf.keras.layers.Layer):
    """Conv2D -> BatchNormalization -> LeakyReLU(0.1) building block.

    Args:
        filters: number of convolution filters.
        size: convolution kernel size.
        stride: convolution stride.
    """

    def __init__(self, filters, size, stride):
        super(conv_relu, self).__init__()
        self.conv = tf.keras.layers.Conv2D(
            filters, size, stride,
            padding="same",
            kernel_initializer=tf.keras.initializers.TruncatedNormal())
        self.batchnorm = tf.keras.layers.BatchNormalization()
        self.lkrelu = tf.keras.layers.LeakyReLU(0.1)

    def call(self, inputs, training):
        """Apply the block; ``training`` is forwarded to batch normalization."""
        normalized = self.batchnorm(self.conv(inputs), training = training)
        return self.lkrelu(normalized)

In [22]:
class Feature_Extracter(tf.keras.Model):
    """VGG16-like convolutional backbone built from ``conv_relu`` blocks.

    Five stages of 2-3 ``conv_relu`` layers (64, 128, 256, 512, 512 filters),
    each followed by a 2x2 max pooling with stride 2, and a final stage of
    three 1024-filter ``conv_relu`` layers without pooling.
    """

    def __init__(self):
        super(Feature_Extracter, self).__init__()

        def halve():
            # 2x2 max pooling with stride 2.
            return tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))

        # Attribute names and creation order are kept as-is: object-based
        # checkpoints address the layers through these attribute names.
        self.cr1 = conv_relu(64, 3, 1)
        self.cr2 = conv_relu(64, 3, 1)
        self.max_pooling1 = halve()
        self.cr3 = conv_relu(128, 3, 1)
        self.cr4 = conv_relu(128, 3, 1)
        self.max_pooling2 = halve()
        self.cr5 = conv_relu(256, 3, 1)
        self.cr6 = conv_relu(256, 3, 1)
        self.cr7 = conv_relu(256, 3, 1)
        self.max_pooling3 = halve()
        self.cr8 = conv_relu(512, 3, 1)
        self.cr9 = conv_relu(512, 3, 1)
        self.cr10 = conv_relu(512, 3, 1)
        self.max_pooling4 = halve()
        self.cr11 = conv_relu(512, 3, 1)
        self.cr12 = conv_relu(512, 3, 1)
        self.cr13 = conv_relu(512, 3, 1)
        self.max_pooling5 = halve()
        self.cr14 = conv_relu(1024, 3, 1)
        self.cr15 = conv_relu(1024, 3, 1)
        self.cr16 = conv_relu(1024, 3, 1)

    def call(self, inputs, training):
        """Run the backbone; ``training`` is forwarded to every conv block."""
        # (conv blocks, pooling layer) per stage; the last stage has no pooling.
        stages = (
            ((self.cr1, self.cr2), self.max_pooling1),
            ((self.cr3, self.cr4), self.max_pooling2),
            ((self.cr5, self.cr6, self.cr7), self.max_pooling3),
            ((self.cr8, self.cr9, self.cr10), self.max_pooling4),
            ((self.cr11, self.cr12, self.cr13), self.max_pooling5),
            ((self.cr14, self.cr15, self.cr16), None),
        )

        x = inputs
        for conv_blocks, pooling in stages:
            for conv_block in conv_blocks:
                x = conv_block(x, training)
            if pooling is not None:
                x = pooling(x)
        return x

In [23]:
class CNN_Encoder(tf.keras.Model):
    """Project extracted image features to ``embedding_dim`` with a ReLU.

    Args:
        embedding_dim: output size of the fully connected layer.
    """

    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # The dense layer acts on the last axis only, so the output keeps the
        # leading axes of the input and ends in embedding_dim.
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Return ``relu(fc(x))``."""
        return tf.nn.relu(self.fc(x))

In [24]:
# Instantiate the CNN backbone, build it for IMAGE_HEIGHT x IMAGE_WIDTH
# 3-channel inputs with an unspecified batch size, and print its layer summary.
feature_extracter = Feature_Extracter()
feature_extracter.build((None,IMAGE_HEIGHT,IMAGE_WIDTH,3))
feature_extracter.summary()

Model: "feature__extracter"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_relu (conv_relu)        multiple                  2048      
_________________________________________________________________
conv_relu_1 (conv_relu)      multiple                  37184     
_________________________________________________________________
max_pooling2d (MaxPooling2D) multiple                  0         
_________________________________________________________________
conv_relu_2 (conv_relu)      multiple                  74368     
_________________________________________________________________
conv_relu_3 (conv_relu)      multiple                  148096    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 multiple                  0         
_________________________________________________________________
conv_relu_4 (conv_relu)      multiple           

In [25]:
# Dense + ReLU projection of the backbone features to embedding_dim.
encoder = CNN_Encoder(embedding_dim)

In [26]:
class RNN_Decoder(tf.keras.Model):
    """Attention-based GRU decoder that emits one character per call.

    Args:
        embedding_dim: size of the character embedding.
        units: number of GRU units, also used for the attention projections
            and the first dense layer.
        vocab_size: number of entries in the vocabulary (size of the logits).
    """

    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        """Run one decoding step.

        Args:
            x: ids of the previous characters; the callers in this notebook
                pass a (batch_size, 1) tensor.
            features: encoded image features handed to the attention module.
            hidden: previous decoder state. It is used only as the attention
                query; the GRU itself is called without an initial state.

        Returns:
            Tuple ``(logits, state, attention_weights)``: logits reshaped to
            (batch_size * sequence_length, vocab_size), the GRU state after
            this step, and the attention weights.
        """
        # Attend over the image features using the previous state.
        context_vector, attention_weights = self.attention(features, hidden)

        # Embed the input ids and prepend the context vector on the last axis.
        embedded = self.embedding(x)
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # Full output sequence plus the final state.
        output, state = self.gru(gru_input)

        # Dense projection, then merge the batch and time axes before the
        # vocabulary projection.
        projected = self.fc1(output)
        flattened = tf.reshape(projected, (-1, projected.shape[2]))
        logits = self.fc2(flattened)

        return logits, state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))

In [27]:
# Attention GRU decoder producing logits over the vocab_size character ids.
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [28]:
# Adam optimizer shared by the feature extractor, encoder and decoder.
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
# Per-example cross-entropy on raw logits; reduction='none' keeps one loss
# value per example so that loss_function can mask padding positions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy for one decoding step that ignores padded targets.

    Args:
        real: target ids for this step; id 0 marks padding.
        pred: logits predicted for this step.

    Returns:
        Scalar loss: per-example losses with padded positions zeroed out,
        averaged over all examples (padded ones still count in the mean).
    """
    per_example = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)

    return tf.reduce_mean(per_example * keep)

In [29]:
# Track all three models and the optimizer in a single checkpoint; the manager
# writes to checkpoint_path and keeps only the 10 most recent checkpoints.
checkpoint_path = "./checkpoints/train_v4"
ckpt = tf.train.Checkpoint(feature_extracter=feature_extracter,
                           encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)

In [30]:
# Resume from the most recent checkpoint, if there is one.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    # Checkpoint names end in "-<save counter>", and the training loop saves
    # once per epoch, so the counter is the number of epochs already done.
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    # Restore the saved weights and optimizer state as well; without this the
    # loop would skip the finished epochs but continue from fresh weights.
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [31]:
# Kept in its own cell on purpose: re-running the training cell then keeps
# appending to this list instead of resetting it.
loss_plot = []

In [32]:
# NOTE(review): presumably the channel count of the Feature_Extracter output
# (its last conv blocks use 1024 filters) -- confirm.
features_shape = 1024
# NOTE(review): presumably the number of spatial locations after the reshape in
# train_step (the report mentions an output of (None,45,1024)) -- confirm.
# Neither constant is referenced by the code visible in this notebook.
attention_features_shape = 45

In [33]:
@tf.function
def train_step(img_tensor, target):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        img_tensor: batch of preprocessed images.
        target: integer tensor (batch_size, sequence_length) of character ids,
            starting with the <start> id.

    Returns:
        Tuple ``(loss, total_loss)``: the loss summed over the decoding steps,
        and that sum divided by the sequence length.
    """
    loss = 0

    # Use the actual size of this batch everywhere; sizing the decoder input
    # with BATCH_SIZE breaks on a final, smaller batch.
    batch_size = target.shape[0]

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=batch_size)

    dec_input = tf.expand_dims([character_to_idx['<start>']] * batch_size, 1)

    with tf.GradientTape() as tape:
        features = feature_extracter(img_tensor,True)
        # Flatten the spatial grid into a sequence of locations:
        # (batch, height, width, channels) -> (batch, height * width, channels).
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)

    total_loss = (loss / int(target.shape[1]))

    trainable_variables = feature_extracter.trainable_variables + encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [34]:
EPOCHS = 10

# Look the special token ids up once instead of hard-coding 27 / 28.
start_id = character_to_idx['<start>']
end_id = character_to_idx['<end>']

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    # ---- training ----
    for (batch, (img_tensor, target)) in enumerate(train_dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss
        print ('Epoch {} {}/{} Train Loss {:.6f}'.format(epoch + 1,batch+1,num_steps,total_loss/(batch+1)),end='\r')
    print('')

    # ---- validation (greedy decoding, no teacher forcing) ----
    equal_num = 0        # words predicted exactly right
    total_val_loss = 0
    val_seen = 0         # validation examples processed so far
    val_batches = 0      # validation batches processed so far
    for (batch, (img_tensor, target)) in enumerate(val_dataset):
        # Size everything from the actual batch so that a final, smaller
        # batch is handled correctly.
        batch_size = target.shape[0]
        val_seen += batch_size
        val_batches = batch + 1

        val_loss = 0
        hidden = decoder.reset_state(batch_size=batch_size)
        dec_input = tf.expand_dims([start_id]*batch_size, 1)
        features = feature_extracter(img_tensor,False)
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)
        # Column 0 holds the <start> id; one predicted column is appended per step.
        result = np.full((batch_size, 1), start_id)
        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions,axis=1).numpy()
            val_loss += loss_function(target[:, i], predictions)
            result = np.concatenate((result, predicted_id.reshape((batch_size,1))), axis=1)
            # Feed the model's own prediction back in.
            dec_input = tf.expand_dims(predicted_id, 1)
        target_array = target.numpy()
        total_val_loss += (val_loss / int(target.shape[1]))
        # A word counts as correct when prediction and target both have <end>
        # at the same position and agree on every character before it.
        for i in range(batch_size):
            for j in range(result.shape[1]):
                if result[i][j] == end_id and target_array[i][j] == end_id:
                    if (result[i][1:j] == target_array[i][1:j]).all():
                        equal_num+=1
                    break
        print ('Validation Accuracy {:.6f}, Validation Loss {:.6f}'.format(float(equal_num)/val_seen,total_val_loss/(batch+1)),end='\r')

    print('')

    # Epoch-level validation metrics, computed from the counts actually seen
    # instead of a hard-coded 20000 (max() guards an empty validation set).
    val_accuracy = float(equal_num) / max(val_seen, 1)
    mean_val_loss = total_val_loss / max(val_batches, 1)

    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    ckpt_manager.save()
    output_string = 'Epoch {} Train Loss {:.6f} Validation Accuracy {:.6f} Validation Loss {:.6f}\n'.format(epoch + 1,
                                                             total_loss/num_steps,val_accuracy,mean_val_loss)
    # The `with` block closes the log file; no explicit close() is needed.
    with open('./lab13-2_v4.log','a') as f:
        f.write(output_string)
    print ('Epoch {} Train Loss {:.6f} Validation Accuracy {:.6f}'.format(epoch + 1,
                                                             total_loss/num_steps,val_accuracy))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 2500/2500 Train Loss 1.108733
Validation Accuracy 0.431200, Validation Loss 0.390321
Epoch 1 Train Loss 1.108733 Validation Accuracy 0.431200
Time taken for 1 epoch 2970.0059263706207 sec

Epoch 2 2500/2500 Train Loss 0.092519
Validation Accuracy 0.862950, Validation Loss 0.077941
Epoch 2 Train Loss 0.092519 Validation Accuracy 0.862950
Time taken for 1 epoch 1035.8364934921265 sec

Epoch 3 2500/2500 Train Loss 0.031428
Validation Accuracy 0.837150, Validation Loss 0.088365
Epoch 3 Train Loss 0.031428 Validation Accuracy 0.837150
Time taken for 1 epoch 1032.3557267189026 sec

Epoch 4 2500/2500 Train Loss 0.016177
Validation Accuracy 0.913550, Validation Loss 0.053589
Epoch 4 Train Loss 0.016177 Validation Accuracy 0.913550
Time taken for 1 epoch 1032.164873123169 sec

Epoch 5 2500/2500 Train Loss 0.012634
Validation Accuracy 0.546050, Validation Loss 0.393007
Epoch 5 Train Loss 0.012634 Validation Accuracy 0.546050
Time taken for 1 epoch 1039.4833495616913 sec

Epoch 6 2500/250

## Restore ckpt9 

In [33]:
# Restore the objects tracked by `ckpt` (feature_extracter, encoder, decoder,
# optimizer) from checkpoint number 9 in ./checkpoints/train_v4.
ckpt.restore('./checkpoints/train_v4/ckpt-9')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f11c86b4430>

In [36]:
# Evaluate the currently loaded weights on the validation set with greedy
# decoding (the model's own prediction is fed back in at every step).
start_id = character_to_idx['<start>']
end_id = character_to_idx['<end>']

equal_num = 0        # words predicted exactly right
total_val_loss = 0
val_seen = 0         # validation examples processed so far
for (batch, (img_tensor, target)) in enumerate(val_dataset):
    # Size everything from the actual batch so that a final, smaller batch is
    # handled correctly.
    batch_size = target.shape[0]
    val_seen += batch_size

    val_loss = 0
    hidden = decoder.reset_state(batch_size=batch_size)
    dec_input = tf.expand_dims([start_id]*batch_size, 1)
    features = feature_extracter(img_tensor,False)
    features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
    features = encoder(features)
    # Column 0 holds the <start> id; one predicted column is appended per step.
    result = np.full((batch_size, 1), start_id)
    for i in range(1, target.shape[1]):
        # passing the features through the decoder
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions,axis=1).numpy()
        val_loss += loss_function(target[:, i], predictions)
        result = np.concatenate((result, predicted_id.reshape((batch_size,1))), axis=1)
        dec_input = tf.expand_dims(predicted_id, 1)
    target_array = target.numpy()
    total_val_loss += (val_loss / int(target.shape[1]))
    # A word counts as correct when prediction and target both have <end> at
    # the same position and agree on every character before it.
    for i in range(batch_size):
        for j in range(result.shape[1]):
            if result[i][j] == end_id and target_array[i][j] == end_id:
                if (result[i][1:j] == target_array[i][1:j]).all():
                    equal_num+=1
                break
    print ('Validation Accuracy {:.6f}, Validation Loss {:.6f}'.format(float(equal_num)/val_seen,total_val_loss/(batch+1)),end='\r')

Validation Accuracy 0.974600, Validation Loss 0.020613

## Prediction

In [34]:
# Test images are named 'a120000' ... 'a139999'.
test_img_name = ['a' + str(image_number) for image_number in range(120000, 140000)]

print(len(test_img_name))

20000


In [35]:
def load_test_image(image_name):
    """Read and preprocess one unlabeled test image.

    Delegates to ``load_image`` so that test images always receive exactly the
    same decoding, resizing and scaling as the training images.

    Args:
        image_name: image name without directory or extension.

    Returns:
        The preprocessed image tensor.
    """
    # load_image returns the annotation untouched, so a dummy None is enough.
    img, _ = load_image(image_name, None)
    return img

In [36]:
# Test pipeline: no shuffling, so the predictions come out in the same order
# as test_img_name.
test_dataset = tf.data.Dataset.from_tensor_slices(test_img_name)
test_dataset = test_dataset.map(load_test_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(200)

In [37]:
# Display the test dataset's element shape and dtype.
test_dataset

<PrefetchDataset shapes: (None, 300, 160, 3), types: tf.float32>

In [40]:
# Greedy-decode every test image and write "<image name> <predicted word>"
# lines to the submission file.
start_id = character_to_idx['<start>']
end_id = character_to_idx['<end>']

num=0
# Open the output once, in write mode: the original re-opened the file in
# append mode for every sample, so re-running the cell duplicated all rows.
with open('./Lab13-2_109062562.txt','w') as f:
    for batch, img_tensor in enumerate(test_dataset):
        # Size everything from the actual batch so that a final, smaller
        # batch is handled correctly.
        batch_size = img_tensor.shape[0]
        hidden = decoder.reset_state(batch_size=batch_size)
        dec_input = tf.expand_dims([start_id]*batch_size, 1)
        features = feature_extracter(img_tensor,False)
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)
        # Column 0 holds the <start> id; one predicted column is appended per step.
        result = np.full((batch_size, 1), start_id)
        for i in range(1, max_len):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions,axis=1).numpy()
            result = np.concatenate((result, predicted_id.reshape((batch_size,1))), axis=1)
            dec_input = tf.expand_dims(predicted_id, 1)
        for i in range(batch_size):
            output_str = ''
            num = num+1
            hit = False
            # Collect characters until the first <end> token.
            for j in range(1,max_len):
                if result[i][j] == end_id:
                    hit = True
                    break
                else:
                    output_str = output_str + idx_to_character[result[i][j]]
            if hit != True:
                # No <end> predicted: report the 1-based sample number.
                print(num)
            # The dataset is not shuffled, so sample `num` is test_img_name[num - 1].
            f.write(test_img_name[num - 1] + ' ' + output_str+'\n')
print(num)

12117
12996
13104
20000


## Report

#### Preprocess

In the preprocessing part, I first read in `spec_train_val.txt` and split the image names and annotations into training and validation lists.

Next, I create two dictionaries, `character_to_idx` and `idx_to_character`, to convert the annotations into indices, character by character. At the same time, I add the `<start>` and `<end>` indices around each word and append 0, which represents `<pad>`, to the end of every index sequence that is shorter than 7 (the maximum sequence length).

#### Reading the images

When reading the images, I first resize each image to 300\*160 (to preserve the original ratio), and then rescale the pixel values with `img/255 - 1`, which maps them from 0~255 to -1~0.

#### CNN model

I design my model architecture based on the convolutional layers of VGG16 and add a BatchNormalization layer between each convolution layer and its activation layer (for the activation I use leaky ReLU). After the 13 convolution layers, I add three more convolution layers with 1024 filters each.

#### Training

In train_step, I first send the images through the CNN model I designed, and then reshape the output into (None,45,1024). The reshaped result is then sent into CNN_Encoder to get the embedding.

After that, I send the embedding into the decoder to get the predictions, and compute the loss between the predictions and the true values.

One thing worth mentioning is that teacher forcing cannot be used during validation, so I change the decoder input from target\[:,i\] to tf.expand_dims(tf.argmax(predictions,axis=1).numpy(),1), i.e. the indices the model itself predicted.

The model has its best validation accuracy and loss at the 9th epoch: the accuracy is 0.9746 and the loss is 0.020613. I use those weights to make predictions on the test data.

#### Problems I encountered and how I solved them

In my experiments, the training loss got stuck at around 2. I later found that the cause was a learning rate that was too large (0.001), which made the model oscillate around the optimum.

I then solved it by using a smaller learning rate, 5e-5.
