In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import os
import string
import glob
import numpy as np
from time import time
from pickle import dump, load

from keras.utils import load_img, img_to_array, pad_sequences, to_categorical
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras import Model, Input

In [3]:
WORKDIR = "/content/drive/MyDrive/Image_captionning"
IMAGE_FOLDER = f"{WORKDIR}/Flickr8k/Flicker8k_Dataset"
PICKLE_FOLDER = f"{WORKDIR}/Flickr8k/Pickle"
GLOVE_DIR =  f"{WORKDIR}/glove"

In [4]:
!apt-get install tree

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tree is already the newest version (1.8.0-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [5]:
!tree /content/drive/MyDrive/Image_captionning/Flickr8k

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
│   ├── [01;35m2846037553_1a1de50709.jpg[00m
│   ├── [01;35m2846785268_904c5fcf9f.jpg[00m
│   ├── [01;35m2846843520_b0e6211478.jpg[00m
│   ├── [01;35m2847514745_9a35493023.jpg[00m
│   ├── [01;35m2847615962_c330bded6e.jpg[00m
│   ├── [01;35m2847859796_4d9cb0d31f.jpg[00m
│   ├── [01;35m2848266893_9693c66275.jpg[00m
│   ├── [01;35m2848571082_26454cb981.jpg[00m
│   ├── [01;35m2848895544_6d06210e9d.jpg[00m
│   ├── [01;35m2848977044_446a31d86e.jpg[00m
│   ├── [01;35m2849194983_2968c72832.jpg[00m
│   ├── [01;35m2850719435_221f15e951.jpg[00m
│   ├── [01;35m2851198725_37b6027625.jpg[00m
│   ├── [01;35m2851304910_b5721199bc.jpg[00m
│   ├── [01;35m2851931813_eaf8ed7be3.jpg[00m
│   ├── [01;35m2852982055_8112d0964f.jpg[00m
│   ├── [01;35m285306009_f6ddabe687.jpg[00m
│   ├── [01;35m2853205396_4fbe8d7a73.jpg[00m
│   ├── [01;35m2853407781_c9fea8eef4.jpg[00m
│   ├── [01;35m2853743795_e90ebc669d

## Read caption file

In [6]:
def load_doc(filename):
  file = open(filename, "r")
  text = file.read()
  file.close()
  return text

cap_list_file = f"{WORKDIR}/Flickr8k/Flickr8k.token.txt"

doc = load_doc(cap_list_file)

doc[:101]

'1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .\n'

Chuyển lại thành dạng dict: id_image: [cap1, cap2, cap3, cap4, cap5]

In [7]:
def load_description(doc):
  mapping = dict()
  for line in doc.split("\n"):
    if len(line) < 2:
      continue
    # Split image name and caption
    image_str, caption = line.split("\t")
    img_id = image_str.split(".")[0]
    if img_id not in mapping:
      mapping[img_id] = []
    mapping[img_id].append(caption)

  return mapping


descriptions = load_description(doc)
print(f"Loaded {len(descriptions)}")

    

Loaded 8092


In [8]:
descriptions['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

## Preprocessing caption

1. Tokenize các từ (vd: dog -> d o g)
2. Chuyển các từ thành lower case
3. Loại bỏ hết các dấu câu
4. Loại bỏ hết các từ là a hoặc sở hữu cách s
5. Loại bỏ các từ có chứa số (vd: 4, ab4, 4b...)

In [9]:
def clean_description(descriptions):
  # Using maketrans to prepare a translation table for removing punctuation
  table = str.maketrans("", "", string.punctuation)

  for key, desc_list in descriptions.items():
    for i in range(len(desc_list)):
      desc = desc_list[i]

      # Tokenize
      desc = desc.split()
      # convert to lower case
      desc = [word.lower() for word in desc]
      # Remove punctuation from each token
      desc = [w.translate(table) for w in desc]
      # Remove hanging 's' and 'a'
      desc = [word for word in desc if len(word) > 1]
      # Remove tokens with numbers in them
      desc = [word for word in desc if word.isalpha()]
      # store as string
      desc_list[i] = ' '.join(desc)

clean_description(descriptions)

In [10]:
descriptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in an entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

## Lưu description (dict chứa image_id và caption của nó) thành file .txt

In [11]:
def save_descriptions(descriptions, filename):
  lines = []
  for key, desc_list in descriptions.items():
    lines += [f"{key} {desc}" for desc in desc_list]

    data = '\n'.join(lines)
  with open(filename, "w") as f:
    f.write(data)
    f.close()

DESCRIPTION_FILENAME = f"{WORKDIR}/Flickr8k/descriptions.txt"
save_descriptions(descriptions, f"{WORKDIR}/Flickr8k/descriptions.txt")

## Load dataset image

In [12]:
# Load image_id từ các file .txt: Flickr_8k.devImages.txt, Flickr_8k.testImages.txt, Flickr_8k.trainImages.txt
def load_set(filename):
  lines = load_doc(filename).split("\n")
  dataset = list()

  for line in lines:
    # skip empty line
    if len(line) < 1:
      continue
    img_id = line.split(".")[0]
    dataset.append(img_id)
  return set(dataset)



In [13]:
all_imgs = glob.glob(IMAGE_FOLDER + "/*.jpg")

all_imgs

['/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/3398746625_5199beea71.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/2635164923_2a774f7854.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/467858872_f3431df682.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/412082368_371df946b3.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/311146855_0b65fdb169.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/3286198467_8880be127e.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/1805990081_da9cefe3a5.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/1835511273_790eaae6e6.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/96420612_feb18fc6c6.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/3258874419_23fec1bdc1.jpg',
 '/content/drive/MyDrive/

In [14]:
def get_list_image(img_filename):
  list_img_files = set(load_doc(img_filename).strip().split("\n"))
  list_img = [img for img in all_imgs
              if img[len(IMAGE_FOLDER) + 1 : ] in list_img_files]
  print(f"Found {len(list_img)} images in {img_filename}")

  return list_img

# def load_ids(filename):
#   lines = load_doc(filename).strip().split("\n")
#   lines = [line.split(".")[0] for line in lines if len(line) > 1]

#   return set(lines)

def load_ids(dataset):
  ids = [img[len(IMAGE_FOLDER) + 1: ].split(".")[0]
            for img in dataset]
  return set(ids)

Load train images

In [15]:
train_images_file = f"{WORKDIR}/Flickr8k/Flickr_8k.trainImages.txt"

train_img = get_list_image(train_images_file)
train_ids = load_ids(train_img)


Found 5985 images in /content/drive/MyDrive/Image_captionning/Flickr8k/Flickr_8k.trainImages.txt


Load test images

In [16]:
test_images_file = f"{WORKDIR}/Flickr8k/Flickr_8k.testImages.txt"

test_img = get_list_image(test_images_file)
test_ids = load_ids(test_img)

Found 997 images in /content/drive/MyDrive/Image_captionning/Flickr8k/Flickr_8k.testImages.txt


## Thêm 'startseq' và 'endseq' cho chuỗi

In [17]:
def load_clean_description(filename, dataset):
  doc = load_doc(filename)
  descriptions = dict()

  for line in doc.split("\n"):
    tokens = line.split()

    image_id, image_desc = tokens[0], tokens[1: ]

    # Skip image not in the set
    if image_id in dataset:
      if image_id not in descriptions:
        descriptions[image_id] = []
      descriptions[image_id].append(f"startseq {' '.join(image_desc)} endseq")
  
  return descriptions

# train descirptions
train_descriptions = load_clean_description(DESCRIPTION_FILENAME, train_ids)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=5985


## Image preprocessing

In [18]:
def preprocess(img_path):
  # Convert images to size 299x299 for feed in inception v3 model
  img = load_img(img_path, target_size=(299, 299))

  # Conver PIL image to numpy
  x = img_to_array(img)

  # Add one more dimension
  # Before : (w, h, c)
  # After : (1, w, h, c)
  x = np.expand_dims(x, axis=0)
  print(x.shape)

  # preprocess the images using preprocess_input() from incpetion module
  x = preprocess_input(x)

  return x


## Load model

In [19]:
model = InceptionV3(weights='imagenet')

In [20]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x7f404a4fca60>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f404cbc9e70>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f404cbcad10>,
 <keras.layers.core.activation.Activation at 0x7f404cbcb1c0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f404cbcb310>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f404a348f10>,
 <keras.layers.core.activation.Activation at 0x7f404a349570>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f404a349ff0>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f404a34b1f0>,
 <keras.layers.core.activation.Activation at 0x7f404a3488b0>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7f404a34b8b0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f404a34bc40>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f404a34a5f0>,
 <keras.layers.core.activation.Activation at 0x7f404a3484c0>,
 <keras.layers.convo

In [21]:
model_new = Model(model.input, model.layers[-2].output)

## Image embedding using inception v3

In [22]:
def encode(image):
  image = preprocess(image)
  fea_vec = model_new.predict(image)
  # Reshape from (1,2048) to (2048,)
  fea_vec = np.reshape(fea_vec, fea_vec.shape[1])

  return fea_vec

def path_to_id(pathfile):
  return pathfile[len(IMAGE_FOLDER) + 1: ]

In [23]:

# start = time()
# encoding_train = {}

# for img in train_img:
#   encoding_train[path_to_id(img)] = encode(img)

# print(f"Time taken in second = {time() - start}")

# # Save embedding
# with open(f"{PICKLE_FOLDER}/encoded_train_images.pkl","wb") as encoded_pickle:
#   dump(encoding_train, encoded_pickle)


Encode test image

In [24]:
# start = time()
# encoding_test = {}

# for img in test_img:
#   encoding_test[path_to_id(img)] = encode(img)

# print(f"Time taken in second = {time() - start}")

# # Save embedding test
# with open(f"{PICKLE_FOLDER}/encoded_test_images.pkl","wb") as encoded_pickle:
#   dump(encoding_test, encoded_pickle)

## Load train features

In [25]:
train_features = load(open(f"{PICKLE_FOLDER}/encoded_train_images.pkl", "rb"))

print(f"Photos: train={len(train_features)}")

Photos: train=5985


## Tạo list training caption

In [26]:
all_train_captions = []
for cap_list in train_descriptions.values():
  all_train_captions += cap_list

len(all_train_captions)

29925

Chỉ lấy các từ xuất hiện trên 10 lần

In [27]:
word_count_threshold = 10
word_counts = {}
nsents = len(all_train_captions)

for sent in all_train_captions:
  for word in sent.split(" "):
    # Count word appeaerd, if not exist count = 0
    word_counts[word] = word_counts.get(word, 0) + 1

vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]

print(f"Preprocesses word {len(word_counts)} -> {len(vocab)}")

Preprocesses word 7570 -> 1648


Bảng lookup table cho các từ

In [28]:
ixtoword = {}
wordtoix = {}

ix = 1
for w in vocab:
  wordtoix[w] = ix
  ixtoword[ix] = w
  ix +=1

In [29]:
vocab_size = len(ixtoword) + 1 #Thêm 1 cho từ dùng đẻ padding
vocab_size

1649

In [30]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
  all_desc = []
  for list_desc in descriptions.values():
    all_desc +=list_desc
  return all_desc

# Calculate the length of the description with the most words
def max_length(descriptions):
  lines = to_lines(descriptions)
  return max(len(d.split()) for d in lines)

# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 34


In [31]:
# data generator cho việc train theo từng batch model.fit_generator()
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
  X1, X2, y = list(), list(), list()
  n = 0
  # loop for ever over images
  while True:
    for key, desc_list in descriptions.items():
      n=+1
      # retreive the photo feature
      photo = photos[key+'.jpg']
      for desc in desc_list:
        # encode the sequence
        seq = [wordtoix[word] for word in desc.split(" ") if word in wordtoix]

        # Split one sequence into multiple X, y pairs
        for i in range(1, len(seq)):
          # split into input and output pair
          in_seq, out_seq = seq[:i], seq[i]

          # Pad input sequence
          in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
          # Encode output sequence
          out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

          # store
          X1.append(photo)
          X2.append(in_seq)
          y.append(out_seq)
      # yield the batch data
      if n == num_photos_per_batch:
        yield [[np.array(X1), np.array(X2), np.array(y)]]
        X1, X2, y = list(), list(), list()
        n = 0



## Chuẩn bị word2vec model là Glove

In [32]:
embeddings_index = {}

with open(f"{GLOVE_DIR}/glove.6B.200d.txt", encoding="utf-8") as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
  f.close()

print(f"Found {len(embeddings_index)} word vector")

Found 400000 word vector


In [33]:
embeddings_index['the']

array([-7.1549e-02,  9.3459e-02,  2.3738e-02, -9.0339e-02,  5.6123e-02,
        3.2547e-01, -3.9796e-01, -9.2139e-02,  6.1181e-02, -1.8950e-01,
        1.3061e-01,  1.4349e-01,  1.1479e-02,  3.8158e-01,  5.4030e-01,
       -1.4088e-01,  2.4315e-01,  2.3036e-01, -5.5339e-01,  4.8154e-02,
        4.5662e-01,  3.2338e+00,  2.0199e-02,  4.9019e-02, -1.4132e-02,
        7.6017e-02, -1.1527e-01,  2.0060e-01, -7.7657e-02,  2.4328e-01,
        1.6368e-01, -3.4118e-01, -6.6070e-02,  1.0152e-01,  3.8232e-02,
       -1.7668e-01, -8.8153e-01, -3.3895e-01, -3.5481e-02, -5.5095e-01,
       -1.6899e-02, -4.3982e-01,  3.9004e-02,  4.0447e-01, -2.5880e-01,
        6.4594e-01,  2.6641e-01,  2.8009e-01, -2.4625e-02,  6.3302e-01,
       -3.1700e-01,  1.0271e-01,  3.0886e-01,  9.7792e-02, -3.8227e-01,
        8.6552e-02,  4.7075e-02,  2.3511e-01, -3.2127e-01, -2.8538e-01,
        1.6670e-01, -4.9707e-03, -6.2714e-01, -2.4904e-01,  2.9713e-01,
        1.4379e-01, -1.2325e-01, -5.8178e-02, -1.0290e-03, -8.21

In [34]:
embedding_dim = 200

# Get 200-dim dense vector for each of 10000 words in vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

embedding_matrix.shape

(1649, 200)

## Tạo model

In [35]:
from keras.layers import Dropout, Dense, Embedding, LSTM, add
# Shape của output embedding từ inceptionv3
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [36]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 34)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 34, 200)      329800      ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['input_2[0][0]']                
                                                                                            

Dùng pretrained cho layer 2 là GLOVE Model

In [37]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [38]:
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [39]:
model.optimizer.lr = 0.0001
epochs = 10
number_pics_per_bath = 6
steps = len(train_descriptions)//number_pics_per_bath

In [None]:
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)

  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)


In [None]:
model.save_weights(f'{WORKDIR}/model_weights/model_30.h5')