In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import pandas as pd
import os
import string
import glob
import numpy as np
from time import time
from pickle import dump, load

from keras.utils import load_img, img_to_array
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras import Model

In [27]:
WORKDIR = "/content/drive/MyDrive/Image_captionning"
IMAGE_FOLDER = f"{WORKDIR}/Flickr8k/Flicker8k_Dataset"
PICKLE_FOLDER = f"{WORKDIR}/Flickr8k/Pickle"

In [4]:
!apt-get install tree

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 24 not upgraded.
Need to get 43.0 kB of archives.
After this operation, 115 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]
Fetched 43.0 kB in 1s (54.7 kB/s)
Selecting previously unselected package tree.
(Reading database ... 122518 files and directories currently installed.)
Preparing to unpack .../tree_1.8.0-1_amd64.deb ...
Unpacking tree (1.8.0-1) ...
Setting up tree (1.8.0-1) ...
Processing triggers for man-db (2.9.1-1) ...


In [5]:
!tree /content/drive/MyDrive/Image_captionning/Flickr8k

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
│   ├── [01;35m2845246160_d0d1bbd6f0.jpg[00m
│   ├── [01;35m2845691057_d4ab89d889.jpg[00m
│   ├── [01;35m2845845721_d0bc113ff7.jpg[00m
│   ├── [01;35m2846037553_1a1de50709.jpg[00m
│   ├── [01;35m2846785268_904c5fcf9f.jpg[00m
│   ├── [01;35m2846843520_b0e6211478.jpg[00m
│   ├── [01;35m2847514745_9a35493023.jpg[00m
│   ├── [01;35m2847615962_c330bded6e.jpg[00m
│   ├── [01;35m2847859796_4d9cb0d31f.jpg[00m
│   ├── [01;35m2848266893_9693c66275.jpg[00m
│   ├── [01;35m2848571082_26454cb981.jpg[00m
│   ├── [01;35m2848895544_6d06210e9d.jpg[00m
│   ├── [01;35m2848977044_446a31d86e.jpg[00m
│   ├── [01;35m2849194983_2968c72832.jpg[00m
│   ├── [01;35m2850719435_221f15e951.jpg[00m
│   ├── [01;35m2851198725_37b6027625.jpg[00m
│   ├── [01;35m2851304910_b5721199bc.jpg[00m
│   ├── [01;35m2851931813_eaf8ed7be3.jpg[00m
│   ├── [01;35m2852982055_8112d0964f.jpg[00m
│   ├── [01;35m285306009_f6ddabe687

## Read caption file

In [6]:
def load_doc(filename):
  file = open(filename, "r")
  text = file.read()
  file.close()
  return text

cap_list_file = f"{WORKDIR}/Flickr8k/Flickr8k.token.txt"

doc = load_doc(cap_list_file)

doc[:101]

'1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .\n'

Chuyển lại thành dạng dict: id_image: [cap1, cap2, cap3, cap4, cap5]

In [7]:
def load_description(doc):
  mapping = dict()
  for line in doc.split("\n"):
    if len(line) < 2:
      continue
    # Split image name and caption
    image_str, caption = line.split("\t")
    img_id = image_str.split(".")[0]
    if img_id not in mapping:
      mapping[img_id] = []
    mapping[img_id].append(caption)

  return mapping


descriptions = load_description(doc)
print(f"Loaded {len(descriptions)}")

    

Loaded 8092


In [8]:
descriptions['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

## Preprocessing caption

1. Tokenize các từ (vd: dog -> d o g)
2. Chuyển các từ thành lower case
3. Loại bỏ hết các dấu câu
4. Loại bỏ hết các từ là a hoặc sở hữu cách s
5. Loại bỏ các từ có chứa số (vd: 4, ab4, 4b...)

In [9]:
def clean_description(descriptions):
  # Using maketrans to prepare a translation table for removing punctuation
  table = str.maketrans("", "", string.punctuation)

  for key, desc_list in descriptions.items():
    for i in range(len(desc_list)):
      desc = desc_list[i]

      # Tokenize
      desc = desc.split()
      # convert to lower case
      desc = [word.lower() for word in desc]
      # Remove punctuation from each token
      desc = [w.translate(table) for w in desc]
      # Remove hanging 's' and 'a'
      desc = [word for word in desc if len(word) > 1]
      # Remove tokens with numbers in them
      desc = [word for word in desc if word.isalpha()]
      # store as string
      desc_list[i] = ' '.join(desc)

clean_description(descriptions)

In [10]:
descriptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in an entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

## Lưu description (dict chứa image_id và caption của nó) thành file .txt

In [11]:
def save_descriptions(descriptions, filename):
  lines = []
  for key, desc_list in descriptions.items():
    lines += [f"{key} {desc}" for desc in desc_list]

    data = '\n'.join(lines)
  with open(filename, "w") as f:
    f.write(data)
    f.close()

DESCRIPTION_FILENAME = f"{WORKDIR}/Flickr8k/descriptions.txt"
save_descriptions(descriptions, f"{WORKDIR}/Flickr8k/descriptions.txt")

## Load dataset image

In [12]:
# Load image_id từ các file .txt: Flickr_8k.devImages.txt, Flickr_8k.testImages.txt, Flickr_8k.trainImages.txt
def load_set(filename):
  lines = load_doc(filename).split("\n")
  dataset = list()

  for line in lines:
    # skip empty line
    if len(line) < 1:
      continue
    img_id = line.split(".")[0]
    dataset.append(img_id)
  return set(dataset)



In [13]:
all_imgs = glob.glob(IMAGE_FOLDER + "/*.jpg")

all_imgs

['/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/3398746625_5199beea71.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/2635164923_2a774f7854.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/467858872_f3431df682.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/412082368_371df946b3.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/311146855_0b65fdb169.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/3286198467_8880be127e.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/1805990081_da9cefe3a5.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/1835511273_790eaae6e6.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/96420612_feb18fc6c6.jpg',
 '/content/drive/MyDrive/Image_captionning/Flickr8k/Flicker8k_Dataset/3258874419_23fec1bdc1.jpg',
 '/content/drive/MyDrive/

In [14]:
def get_list_image(img_filename):
  list_img_files = set(load_doc(img_filename).strip().split("\n"))
  list_img = [img for img in all_imgs
              if img[len(IMAGE_FOLDER) + 1 : ] in list_img_files]
  print(f"Found {len(list_img)} images in {img_filename}")

  return list_img

# def load_ids(filename):
#   lines = load_doc(filename).strip().split("\n")
#   lines = [line.split(".")[0] for line in lines if len(line) > 1]

#   return set(lines)

def load_ids(dataset):
  ids = [img[len(IMAGE_FOLDER) + 1: ].split(".")[0]
            for img in dataset]
  return set(ids)

Load train images

In [15]:
train_images_file = f"{WORKDIR}/Flickr8k/Flickr_8k.trainImages.txt"

train_img = get_list_image(train_images_file)
train_ids = load_ids(train_img)


Found 5985 images in /content/drive/MyDrive/Image_captionning/Flickr8k/Flickr_8k.trainImages.txt


Load test images

In [16]:
test_images_file = f"{WORKDIR}/Flickr8k/Flickr_8k.testImages.txt"

test_img = get_list_image(test_images_file)
test_ids = load_ids(test_img)

Found 997 images in /content/drive/MyDrive/Image_captionning/Flickr8k/Flickr_8k.testImages.txt


## Thêm 'startseq' và 'endseq' cho chuỗi

In [17]:
def load_clean_description(filename, dataset):
  doc = load_doc(filename)
  descriptions = dict()

  for line in doc.split("\n"):
    tokens = line.split()

    image_id, image_desc = tokens[0], tokens[1: ]

    # Skip image not in the set
    if image_id in dataset:
      if image_id not in descriptions:
        descriptions[image_id] = []
      descriptions[image_id].append(f"startseq {' '.join(image_desc)} endseq")
  
  return descriptions

# train descirptions
train_descriptions = load_clean_description(DESCRIPTION_FILENAME, train_ids)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=5985


## Image preprocessing

In [18]:
def preprocess(img_path):
  # Convert images to size 299x299 for feed in inception v3 model
  img = load_img(img_path, target_size=(299, 299))

  # Conver PIL image to numpy
  x = img_to_array(img)

  # Add one more dimension
  # Before : (w, h, c)
  # After : (1, w, h, c)
  x = np.expand_dims(x, axis=0)
  print(x.shape)

  # preprocess the images using preprocess_input() from incpetion module
  x = preprocess_input(x)

  return x


## Load model

In [19]:
model = InceptionV3(weights='imagenet')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [20]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x7f9a42428b50>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f9a422c97b0>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f9a422ca650>,
 <keras.layers.core.activation.Activation at 0x7f9a422cab90>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f9a422cb610>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f9a41d04be0>,
 <keras.layers.core.activation.Activation at 0x7f9a422cbe50>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f9a41d06140>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f9a41d06da0>,
 <keras.layers.core.activation.Activation at 0x7f9a41d07880>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x7f9a41d07130>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x7f9a41d07040>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7f9a41d11930>,
 <keras.layers.core.activation.Activation at 0x7f9a41d075e0>,
 <keras.layers.convo

In [21]:
model_new = Model(model.input, model.layers[-2].output)

## Image embedding using inception v3

In [22]:
def encode(image):
  image = preprocess(image)
  fea_vec = model_new.predict(image)
  # Reshape from (1,2048) to (2048,)
  fea_vec = np.reshape(fea_vec, fea_vec.shape[1])

  return fea_vec

def path_to_id(pathfile):
  return pathfile[len(IMAGE_FOLDER) + 1: ]

In [23]:

# start = time()
# encoding_train = {}

# for img in train_img:
#   encoding_train[path_to_id(img)] = encode(img)

# print(f"Time taken in second = {time() - start}")

# # Save embedding
# with open(f"{PICKLE_FOLDER}/encoded_train_images.pkl","wb") as encoded_pickle:
#   dump(encoding_train, encoded_pickle)


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 29

{'467858872_f3431df682.jpg': array([0.13652357, 0.84613407, 0.07270911, ..., 0.89153385, 0.9210661 ,
        0.12755296], dtype=float32),
 '412082368_371df946b3.jpg': array([0.9660368 , 0.9704103 , 1.1733086 , ..., 1.2251186 , 1.3467271 ,
        0.45191744], dtype=float32),
 '1805990081_da9cefe3a5.jpg': array([0.20327145, 0.15002584, 0.62739736, ..., 0.06438097, 0.18958785,
        0.00111707], dtype=float32),
 '1835511273_790eaae6e6.jpg': array([2.9284328e-01, 9.6182518e-02, 3.8973600e-01, ..., 5.4154679e-04,
        1.1174066e-01, 7.1021515e-01], dtype=float32),
 '2423550887_ffc9bbcf71.jpg': array([0.11446597, 0.24203339, 0.11410569, ..., 0.15075657, 0.47693205,
        0.366555  ], dtype=float32),
 '3658733605_fbcf570843.jpg': array([0.12000421, 0.8417364 , 0.20137937, ..., 0.18400173, 0.504251  ,
        0.27483287], dtype=float32),
 '3041348852_872c027c16.jpg': array([0.42614076, 0.40578833, 0.13581726, ..., 0.28869703, 0.25943187,
        0.06696425], dtype=float32),
 '293251941

Encode test image

In [30]:
# start = time()
# encoding_test = {}

# for img in test_img:
#   encoding_test[path_to_id(img)] = encode(img)

# print(f"Time taken in second = {time() - start}")

# # Save embedding test
# with open(f"{PICKLE_FOLDER}/encoded_test_images.pkl","wb") as encoded_pickle:
#   dump(encoding_test, encoded_pickle)

(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 3)
(1, 299, 299, 

## Load train features

In [32]:
train_features = load(open(f"{PICKLE_FOLDER}/encoded_train_images.pkl", "rb"))

print(f"Photos: train={len(train_features)}")

Photos: train=5985
