In [1]:
import os

import numpy as np
import pandas
import matplotlib.pyplot as plt
%matplotlib inline
import string
import PIL
import glob
from pickle import dump, load
from time import time
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import os


In [2]:

def load_doc(filename):
    #open the file as read only
    file = open(filename, 'r')
    #read all text
    text = file.read()
    #close the file
    file.close()
    return text

# save caption in format of key value
# id_image : ['caption 1', 'caption 2', 'caption 3',' caption 4', 'caption 5']

def load_descriptions(doc):
    mapping = dict()
    #process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # extract filename from image id
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ''.join(image_desc)
        # create the list if needed
        if image_id not in mapping:
            mapping[image_id] = list()
        # store description
        mapping[image_id].append(image_desc)
    return mapping

# Preprocessing text
def clean_description(description):
    # prepare translation table for removing punctuation
    table = str.maketrans('','',string.punctuation)
    for key, desc_list in description.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # tokenize
            desc = desc.split()
            # convert to lower case
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word) > 1]
            # remve tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # store as string
            desc_list[i] = ''.join(desc)

def save_description(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
            data = '\n'.join(lines)
            file = open(filename, 'w')
            file.write(data)
            file.close()





In [3]:
filename = "/home/quan/PycharmProjects/DL_Python/Flickr8k/Flickr8k_text/Flickr8k.token.txt"
doc = load_doc(filename)
descriptions = load_descriptions(doc)
clean_description(descriptions)
save_description(descriptions,'descriptions.txt')

In [4]:
# take image id correspond to data train, test, dev
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)


In [5]:
filename = '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)


images = '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/'
# read image
img = glob.glob(images + '*.jpg')
print(img[:10])
# read file contain image id for training
train_images = set(open(filename, 'r').read().strip().split('\n'))

# create list of all training images with their full path names
train_img = []

for i in img:
    if i[len(images):] in train_images:
        train_img.append(i)

# File chứa các id ảnh để test
test_images_file = '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt'
# Read the validation image names in a set# Read the test image names in a set
test_images = set(open(test_images_file, 'r').read().strip().split('\n'))
# Create a list of all the test images with their full path names
test_img = []
for i in img: # img is list of full path names of all images
    if i[len(images):] in test_images: # Check if the image belongs to test set
        test_img.append(i) # Add it to the list of test images

['/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/3087485737_cb09bc80b6.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/1324816249_86600a6759.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/3591170729_406fdb74e5.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/540338917_57069687be.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/397982550_cf9f5cdb74.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/418357172_bdddf71d32.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/3590593467_be497a6139.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/3216762979_813c45a8ec.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/2730819220_b58af1119a.jpg', '/home/quan/PycharmProjects/DL_Python/Flickr8k/Flicker8k_Dataset/3025513877_1a6160070d.jpg']


In [6]:
# add 'startseq', 'endseq' in each sequence
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from descriptions
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the s
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap descriptions in tokens
            desc = 'startseq ' + ''.join(image_desc) + ' endseq'

            descriptions[image_id].append(desc)
    return descriptions

train_description = load_clean_descriptions('descriptions.txt', train)

In [7]:
# load image and resize to Inception-v3 required
def preprocess(image_path):
    # convert all images to size 299x299
    img = image.load_img(image_path, target_size=(299, 299))
    # convert image to vector
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

# load the inception v3 model
model = InceptionV3(weights='imagenet')

new_model = Model(model.input, model.layers[-2].output)

# Image embedding to vector (2048,)
def encode(image):
    image = preprocess(image)
    fea_vec = new_model.predict(image)
    fea_vec = np.reshape(fea_vec, fea_vec.shape[1])
    return fea_vec

start = time()
encoding_train = {}
for img in train_img:
    encoding_train[img[len(images) : ]] = encode(img)
print("Time taken in seconds = ", time() - start)

# Save image embedding
with open('/home/quan/PycharmProjects/DL_Python/Flickr8k/Pickle/encoded_images_train.pkl', 'wb') as encoded_pickle:
    dump(encoding_train, encoded_pickle)


# Encode test image
start = time()
encoding_test = {}
for img in test_img:
    encoding_test[img[len(images):]] = encode(img)


# Save the bottleneck test features to disk
with open("Flickr8k/Pickle/encoded_test_images.pkl", "wb") as encoded_pickle:
    dump(encoding_test, encoded_pickle)

Time taken in seconds =  397.8975234031677


In [8]:
train_features = load(open("/home/quan/PycharmProjects/DL_Python/Flickr8k/Pickle/encoded_images_train.pkl", "rb"))
print('Photos: train=%d' % len(train_features))


# Tạo list các training caption
all_train_captions = []
for key, val in train_description.items():
    for cap in val:
        all_train_captions.append(cap)
len(all_train_captions)
# Chỉ lấy các từ xuất hiện trên 10 lần
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1
vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))
ixtoword = {}
wordtoix = {}
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1
vocab_size = len(ixtoword) + 1 # Thêm 1 cho từ dùng để padding
vocab_size
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc
# calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)
# determine the maximum sequence length
max_length = max_length(train_description)
print('Description Length: %d' % max_length)
# data generator cho việc train theo từng batch model.fit_generator()
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    X1, X2, y = list(), list(), list()
    n=0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n+=1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data
            if n==num_photos_per_batch:
                yield [[np.array(X1), np.array(X2)], np.array(y)]
                X1, X2, y = list(), list(), list()
                n=0

Photos: train=6000
preprocessed words 29662 -> 3
Description Length: 3


In [11]:
# Load Glove model
glove_dir = ''
embeddings_index = {} # empty dictionary
f = open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [16]:
print('Found %s word vectors.' % len(embeddings_index))
embeddings_index['the']
embedding_dim = 200
# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

# Create model
inputs1 = layers.Input(shape=(2048,))
fe1 = layers.Dropout(0.5)(inputs1)
fe2 = layers.Dense(256, activation='swish')(fe1)
inputs2 = layers.Input(shape=(max_length,))
se1 = layers.Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = layers.Dropout(0.5)(se1)
se3 = layers.LSTM(256)(se2)
decoder1 = layers.add([fe2,se3])
decoder2 = layers.Dense(256, activation='swish')(decoder1)
outputs = layers.Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

model.summary()
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True)

Found 400000 word vectors.
Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 3)]          0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 3, 200)       800         input_13[0][0]                   
__________________________________________________________________________________________________
dropout_10 (Dropout)            (None, 2048)         0           input_12[0][0]                   
____________________________________________________________

In [17]:
# layer 2 use GLOVE model so we ser weight for it and no need to train
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.optimizer.lr = 0.0001
epochs = 10
number_pics_per_bath = 6
steps = len(train_description)//number_pics_per_bath

for i in range(epochs):
    generator = data_generator(train_description, train_features, wordtoix, max_length, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)

model.save_weights('/home/quan/PycharmProjects/DL_Python/models/model_weights/model_30.h5')

Instructions for updating:
Please use Model.fit, which supports generators.


ValueError: in user code:

    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:757 train_step
        self.trainable_variables)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:2737 _minimize
        trainable_variables))
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:562 _aggregate_gradients
        filtered_grads_and_vars = _filter_grads(grads_and_vars)
    /home/quan/.local/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1271 _filter_grads
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['dense_15/kernel:0', 'dense_15/bias:0', 'lstm_5/lstm_cell_5/kernel:0', 'lstm_5/lstm_cell_5/recurrent_kernel:0', 'lstm_5/lstm_cell_5/bias:0', 'dense_16/kernel:0', 'dense_16/bias:0', 'dense_17/kernel:0', 'dense_17/bias:0'].
