In [1]:
import tensorflow as tf
tf.enable_eager_execution()

import json
import pickle
import os
import pandas as pd
import numpy as np

# Scikit-learn includes many helpful utilities
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Prepare Data
1. Load captions, image paths, image ids from json file
2. Data Shuffling
3. Prepare image: resize image, convert pixels to digits
4. Prepare annotations

<h3>1. Load captions, image paths, image ids from json file</h3>

In [2]:
# annotation_zip = tf.keras.utils.get_file('captions.zip', 
#                                           cache_subdir=os.path.abspath('.'),
#                                           origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
#                                           extract = True)
# annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'

# name_of_zip = 'train2014.zip'
# if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
#     image_zip = tf.keras.utils.get_file(name_of_zip, 
#                                       cache_subdir=os.path.abspath('.'),
#                                       origin = 'http://images.cocodataset.org/zips/train2014.zip',
#                                       extract = True)
#     PATH = os.path.dirname(image_zip)+'/train2014/'
# else:
#     PATH = os.path.abspath('.')+'/train2014/'

# Local locations of the caption annotation JSON and of the training image folder.
# NOTE(review): presumably these are the files fetched by the commented-out
# download code above — confirm they exist relative to the notebook directory.
annotation_file = 'captions_train2014.json'
images_path = './train2014/'

In [3]:
# Parse the caption annotation JSON file into a Python object.
with open(annotation_file, 'r') as annotation_fh:
    annotations = json.load(annotation_fh)

In [4]:
# Build three parallel lists with one entry per caption annotation: the caption
# text wrapped in <start>/<end> markers, the path of the image it describes,
# and the id of that image.
all_captions = []
img_name_vector = []
image_ids = []
for annot in annotations['annotations']:
    caption = '<start> ' + annot['caption'] + ' <end>'
    image_id = annot['image_id']
    # Use the image directory configured in `images_path`. The name `PATH` that
    # was used here is only assigned inside the commented-out download code, so
    # a fresh kernel raised NameError on this line.
    full_coco_image_path = images_path + 'COCO_train2014_' + '%012d.jpg' % (image_id)

    img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)
    image_ids.append(image_id)

In [9]:
# Find annotations dataframe, if it doesn't exist, create one. If it exists, read the dataframe.
# if not os.path.exists('annotations.csv'):
#     # create a dataframe to store image ids, image file paths and captions
#     annotations = pd.DataFrame({'image_id': image_ids,
#                             'image_file': img_name_vector,
#                             'caption': all_captions})
#     # Save the dataframe as csv
#     annotations.to_csv("annotations.csv")
    
# else:
#     annotations = pd.read_csv("annotations.csv")
#     captions = annotations['caption'].values
#     image_ids = annotations['image_id'].values
#     image_files = annotations['image_file'].values

<h3>2. Data Shuffling</h3>

In [5]:
# Shuffle the three parallel lists in a single call so that entry i of each list
# still refers to the same annotation afterwards. The fixed random_state keeps
# the resulting order reproducible between runs.
shuffled_lists = shuffle(all_captions, img_name_vector, image_ids, random_state=1)
train_captions, img_name_vector, image_ids = shuffled_lists

<h3>3. Prepare image: resize images, convert pixels to digits</h3>
<p>Two methods: one would output tensor data type, the other would generate numpy array.</p>

In [7]:
###------ Method 1 ------ ##

In [5]:
# Method 1: load and resize pictures with TensorFlow ops; the output is a tf.Tensor.
def tf_load_img(image_path):
    """Read a JPEG file and return it as a standardized 224x224, 3-channel tensor.

    Args:
        image_path: path of the JPEG file to read.

    Returns:
        A tuple ``(image, image_path)``: the decoded image resized to 224x224
        and passed through ``tf.image.per_image_standardization``, together
        with the unchanged input path.
    """
    raw_bytes = tf.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize_images(decoded, (224, 224))
    standardized = tf.image.per_image_standardization(resized)
    # Model-specific preprocessing alternatives that are currently disabled:
    #     tf.keras.applications.vgg16.preprocess_input(img)
    #     tf.keras.applications.inception_v3.preprocess_input(img)
    return standardized, image_path

In [9]:
###------ Method 2 ------ ##

In [13]:
# Method 2: load and resize pictures with Keras; the output is a numpy array.

def keras_load_img(img_path):
    """Load an image with Keras and prepare it as a one-image batch for VGG16.

    Args:
        img_path: path of the image file to load.

    Returns:
        A tuple ``(img, img_path)`` where ``img`` is the image resized to
        224x224, converted to an array, given a leading batch axis of size 1
        and passed through the VGG16 ``preprocess_input``; ``img_path`` is
        returned unchanged.
    """
    img = load_img(img_path, target_size=(224, 224))
    # convert the image pixels to a numpy array
    img = img_to_array(img)
    # add a leading batch axis: (h, w, c) -> (1, h, w, c)
    img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
    # Prepare the image for the VGG model and keep the result. This line used to
    # read `img + preprocess_input(img)`, which built a sum and discarded it.
    # (Disabled alternative: tf.keras.applications.inception_v3.preprocess_input)
    img = preprocess_input(img)
    # The earlier `image_features_extract_model.predict(img)` call was removed:
    # its result was never used or returned, and it made this loader depend on
    # a model that is only created in a later cell.
    return img, img_path

In [7]:
# tf_load_img('loverna-journey-1053456-unsplash.jpg')

In [7]:
# VGG16 without its classifier head (include_top=False), using ImageNet weights.
image_model = tf.keras.applications.vgg16.VGG16(weights='imagenet',
                                                include_top=False)

# InceptionV3 alternative, currently disabled:
# image_model = tf.keras.applications.InceptionV3(include_top=False,
#                                                 weights='imagenet')

# Feature extractor: takes the base network's input and exposes the output of
# its last layer.
new_input = image_model.input
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(inputs=new_input,
                                              outputs=hidden_layer)


In [None]:
# Unique image paths in sorted order, so each image is encoded only once.
encode_train = sorted(set(img_name_vector))

# Load the images in batches; feel free to change the batch size according to
# your system configuration.
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
image_dataset = image_dataset.map(tf_load_img)
image_dataset = image_dataset.batch(16)

for img, path in image_dataset:
    batch_features = image_features_extract_model(img)
    # Keep the first and last axes and merge everything in between into one axis.
    flat_shape = [batch_features.shape[0], -1, batch_features.shape[3]]
    batch_features = tf.reshape(batch_features, flat_shape)

    # np.save appends ".npy" to the name, so every feature array is written
    # next to its image as "<image path>.npy".
    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8")
        np.save(path_of_feature, bf.numpy())

<h3>4. Prepare annotations</h3>
* Step 1. 分詞，為詞彙編號碼
* Step 2. 限制字彙數量以5000為上限，而5000以外的字彙則以"UNK"(for unknown)去替代
* Step 3. 將文字與號碼配對 (create a word --> index mapping)
* Step 4. pad_sequence() 讓每個captions的長度相同
* Step 5. Create word2idx dict (vocabulary size < 5000)
* Step 6. Create idx2word dict (vocabulary size < 5000)
* Step 7. Calculate TF-IDF
* Step 8. Build vocabulary


<h4>keras Tokenizer API</h4>
* word_counts: A dictionary of words and their counts.
* word_docs: A dictionary of words and how many documents each appeared in.
* word_index: A dictionary of words and their uniquely assigned integers.
* document_count: An integer count of the total number of documents that were used to fit the Tokenizer.

In [11]:
## --- Step 1~2 --- ##

# choosing the top 5000 words from the vocabulary
top_k = 5000

# Build the tokenizer.
# The string given as oov_token replaces words that are not in the vocabulary.
# ('\\]' spells the same backslash + ']' characters as before, without relying
# on an invalid escape sequence.)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
## --- Step 3 --- ##
# Updates internal vocabulary based on a list of texts.
# Every word gets an index: the more frequent the word, the smaller its index.
tokenizer.fit_on_texts(train_captions)

## --- Step 5 --- ##
# Restrict word2idx to the words whose index is below top_k.
tokenizer.word_index = {key: value for key, value in tokenizer.word_index.items() if value < top_k}
# putting <unk> token in the word2idx dictionary, with index top_k + 1 (5001)
tokenizer.word_index[tokenizer.oov_token] = top_k + 1
# the <pad> token has index 0
tokenizer.word_index['<pad>'] = 0

# Transforms each text in texts in a sequence of integers (word2idx).
# This runs AFTER word_index was truncated and <unk> was remapped, so every id
# in train_seqs has an entry in word_index / index_word. Tokenizing before the
# rewrite (as this cell used to) left the sequences out of sync with the final
# vocabulary.
train_seqs = tokenizer.texts_to_sequences(train_captions)

## --- Step 4 --- ##
# Pad every caption to the same length.
# If maxlen is not provided, pad_sequences calculates it automatically;
# here maxlen is set to 20.
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre'), so captions longer than 20 tokens lose their leading
# tokens — confirm this is intended.
cap_padded = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post', maxlen=20)

## --- Step 6 --- ##
# creating a reverse mapping (index -> word)
# tokenizer.word_index above is a dict with words as keys and indices as values;
# index_word has indices as keys and words as values (idx2word).
index_word = {value: key for key, value in tokenizer.word_index.items()}

## --- Step 7 --- ##
# Create the object that computes TF-IDF.
# stop_words must be passed by keyword: the first positional parameter of
# TfidfVectorizer is `input`, so TfidfVectorizer('english') never enabled the
# English stop-word list.
tfid_vectorizer = TfidfVectorizer(stop_words='english')
# feed the captions to the TF-IDF vectorizer
tfid_vectorizer.fit(train_captions)

## --- Step 8 --- ##
# collect the vocabulary items used in the vectorizer
# NOTE(review): vocabulary_ maps each term to its feature (column) index, not
# to a frequency rank, so this filter keeps the terms with the lowest column
# indices rather than the most frequent ones — confirm this is what is wanted
# (max_features=top_k on the vectorizer would select by frequency).
dictionary = tfid_vectorizer.vocabulary_.items()
vocabulary = {key: value for key, value in dictionary if value < top_k}

Use the following function to calculate the maximum length of captions, if needed.

In [12]:
# This will find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    """Return the length of the longest item in ``tensor``.

    Args:
        tensor: iterable of sized items, e.g. a list of token-id lists.

    Returns:
        The largest ``len(item)`` found, or 0 when ``tensor`` is empty.
    """
    # default=0 avoids the ValueError that max() raises on an empty iterable
    return max((len(t) for t in tensor), default=0)

In [13]:
# calculating the max_length: the length of the longest sequence in train_seqs
# used to store the attention weights
max_length = calc_max_length(train_seqs)

* Step 9. Prepare masks, which are used to erase the padding part of a sentence when training.

In [14]:
# One mask per tokenized caption: 1.0 over the leading positions that hold a
# token, 0.0 over the remaining positions.
def _caption_mask(token_ids):
    """Return a length-20 float array with ones in the first len(token_ids) slots."""
    mask = np.zeros(20)  # max_caption_length = 20
    mask[:len(token_ids)] = 1.0
    return mask

masks = [_caption_mask(token_ids) for token_ids in train_seqs]

# data = {'word_idxs': cap_padded, 'masks': masks}
# np.save('data.npy', data)

In [16]:
# Each element pairs an image path with its padded caption, its mask and its image id.
dataset = tf.data.Dataset.from_tensor_slices((img_name_vector, cap_padded, np.array(masks), image_ids))

def map_func(img_name, cap_padded, masks, image_id):
    """Swap an image path for the feature array stored for that image.

    Called through ``tf.py_func`` once per dataset element.

    Args:
        img_name: image path as bytes; the features are read from
            ``<img_name>.npy``.
        cap_padded: padded caption token ids of this element.
        masks: padding mask of this element.
        image_id: id of the image of this element.

    Returns:
        ``(img_tensor, cap_padded, masks, image_id)`` cast to float32, int32,
        float64 and int64 respectively, matching the output dtypes declared in
        the ``tf.py_func`` call below.
    """
    img_tensor = np.load(img_name.decode('utf-8') + '.npy')
    # Return this element's own id. The previous version accepted only three
    # arguments (py_func passes four) and returned the global `image_ids` list.
    return (np.asarray(img_tensor, dtype=np.float32),
            np.asarray(cap_padded, dtype=np.int32),
            np.asarray(masks, dtype=np.float64),
            np.asarray(image_id, dtype=np.int64))

# Dataset.map returns a new dataset, so the result has to be assigned back;
# otherwise the feature loading is silently skipped.
dataset = dataset.map(lambda item1, item2, item3, item4: tf.py_func(
    map_func, [item1, item2, item3, item4],
    [tf.float32, tf.int32, tf.float64, tf.int64]),
    num_parallel_calls=2)

# shuffling and batching
dataset = dataset.shuffle(10000)  # BUFFER_SIZE = 10000
dataset = dataset.repeat(5)
dataset = dataset.batch(64)
dataset = dataset.prefetch(1)
