In [3]:
import json
import pickle
import os
import pandas as pd
import numpy as np

import tensorflow as tf

# Scikit-learn includes many helpful utilities
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

  from ._conv import register_converters as _register_converters


# Prepare Data
1. Load captions, image paths, image ids from json file
2. Data Shuffling
3. Prepare image: resize image, convert pixels to digits
4. Prepare annotations

<h3>1. Load captions, image paths, image ids from json file</h3>

In [7]:
# Directory prefix of the training images; image file names are appended to it.
images_path = "./coco/train/images/"
# Caption annotation file (JSON) for the same training split.
annotation_file = "./coco/train/captions_train2014.json"

In [8]:
# Parse the caption annotation JSON into memory.
# The encoding is pinned so the read does not depend on the platform default.
with open(annotation_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

In [9]:
# Flatten the annotation records into three parallel lists (same order, same length).

# Every caption is wrapped in explicit start/end markers.
all_captions = ['<start> ' + record['caption'] + ' <end>'
                for record in annotations['annotations']]

image_ids = [record['image_id'] for record in annotations['annotations']]

# Image files are named after the image id, zero-padded to 12 digits.
img_name_vector = [images_path + '%012d.jpg' % (img_id) for img_id in image_ids]

In [10]:
# Cache the (image_id, image_file, caption) table on disk: build and save it when
# 'annotations.csv' does not exist yet, otherwise read it back.
# NOTE: this rebinds `annotations` from the parsed JSON dict to a DataFrame, so the
# cell that iterates annotations['annotations'] cannot be re-run after this one
# without reloading the JSON first.
if not os.path.exists('annotations.csv'):
    # One row per caption: image id, image file path and caption text.
    annotations = pd.DataFrame({'image_id': image_ids,
                                'image_file': img_name_vector,
                                'caption': all_captions})
    # index=False keeps the row index out of the file, so reading it back yields
    # exactly these three columns instead of an extra "Unnamed: 0" column.
    annotations.to_csv("annotations.csv", index=False)
else:
    annotations = pd.read_csv("annotations.csv")

# Expose the columns as arrays on BOTH paths, so the names below exist (and have the
# same type) whether the table was just built or loaded from the cache.
captions = annotations['caption'].values
image_ids = annotations['image_id'].values
image_files = annotations['image_file'].values

<h3>2. Data Shuffling</h3>

In [11]:
# Shuffle the three parallel sequences together so caption, image path and image id
# stay aligned; the fixed random_state makes the order reproducible.
train_captions, img_name_vector, image_ids = shuffle(
    all_captions, img_name_vector, image_ids,
    random_state=1,
)

<h3>3. Prepare image: resize images, convert pixels to digits</h3>
<p>Two methods: one would output tensor data type, the other would generate numpy array.</p>

In [12]:
###------ Method 1 ------ ##

In [13]:
# Resize pictures with TensorFlow; the output is a 'tf.Tensor'.
def tf_load_img(img_path):
    """Read a JPEG file and return it resized to 224x224, together with its path.

    Args:
        img_path: path of the JPEG file to read.

    Returns:
        A pair (image, img_path): the decoded 3-channel image resized to
        224x224, and the path that was passed in.
    """
    raw_bytes = tf.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    # No model-specific preprocess_input step is applied here.
    return tf.image.resize_images(decoded, (224, 224)), img_path

In [14]:
###------ Method 2 ------ ##

In [15]:
# resize pictures by Keras, output would be 'numpy array'
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input

def keras_load_img(img_path):
    """Load an image with Keras and return it as a VGG16-preprocessed batch of one.

    Args:
        img_path: path of the image file to load.

    Returns:
        The pixel array with a leading batch axis of size 1, after
        keras.applications.vgg16.preprocess_input has been applied.
    """
    # Load at the 224x224 target size and convert the pixels to an array.
    pixels = img_to_array(load_img(img_path, target_size=(224, 224)))
    # Prepend the batch axis the model expects: (h, w, c) -> (1, h, w, c).
    batch = pixels.reshape((1,) + pixels.shape[:3])
    # Only the model-specific input preprocessing happens here; no features are extracted.
    return preprocess_input(batch)

Using TensorFlow backend.


<h3>4. Prepare annotations</h3>
* Step 1. 分詞，為詞彙編號碼
* Step 2. 限制字彙數量以5000為上限，而5000以外的字彙則以"UNK"(for unknown)去替代
* Step 3. 將文字與號碼配對 (create a word --> index mapping)
* Step 4. pad_sequence() 讓每個captions的長度相同
* Step 5. Create word2idx dict (vocabulary size < 5000)
* Step 6. Create idx2word dict (vocabulary size < 5000)
* Step 7. Calculate TF-IDF

<h4>keras Tokenizer API</h4>
* word_counts: A dictionary of words and their counts.
* word_docs: A dictionary of words and how many documents each appeared in.
* word_index: A dictionary of words and their uniquely assigned integers.
* document_count: An integer count of the total number of documents that were used to fit the Tokenizer.

In [18]:
## --- Step 1~2 --- ##

# Keep only the top_k most frequent words of the vocabulary.
top_k = 5000

# Build the tokenizer. oov_token is the string used in place of words that are not
# in the dictionary.
# The filter set contains neither '<' nor '>', so the '<start>' / '<end>' markers
# added to every caption are not stripped.
# The backslash is spelled '\\' (the same runtime characters as before) because
# '\]' is an invalid escape sequence that newer Python versions warn about.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')

## --- Step 3 --- ##
# Update the internal vocabulary from the list of captions.
# Every word gets a number: the more frequent the word, the smaller its index.
tokenizer.fit_on_texts(train_captions)

# Transform each caption into the sequence of integer indices of its words (word2idx).
train_seqs = tokenizer.texts_to_sequences(train_captions)

## --- Step 4 --- ##
# Pad every sequence so that all captions have the same length.
# If maxlen is not provided pad_sequences computes it automatically; here it is fixed
# to 20 and the padding is appended at the end (padding='post').
# NOTE(review): `truncating` is left at its default — check which end of captions
# longer than 20 tokens is cut off, and whether a <start>/<end> marker is lost.
cap_padded = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post', maxlen=20)

## --- Step 5 --- ##
# Restrict word2idx to the words whose index is below top_k (the same limit that was
# given to the Tokenizer as num_words), instead of repeating the literal 5000.
tokenizer.word_index = {word: idx
                        for word, idx in tokenizer.word_index.items()
                        if idx < top_k}
# Put the <unk> token into the word2idx dictionary at index top_k + 1.
# NOTE(review): train_seqs was generated before this remapping — confirm that the
# index the tokenizer emitted for out-of-vocabulary words matches top_k + 1.
tokenizer.word_index[tokenizer.oov_token] = top_k + 1
# The <pad> token has index 0.
tokenizer.word_index['<pad>'] = 0

## --- Step 6 --- ##
# Create the reverse mapping (index -> word).
# tokenizer.word_index is a dict with the word as key and the index as value;
# index_word has the index as key and the word as value (idx2word).
index_word = {idx: word for word, idx in tokenizer.word_index.items()}

## --- Step 7 --- ##
# Create the object dedicated to computing TF-IDF.
# 'english' must be passed as `stop_words`: given positionally it binds to the first
# parameter (`input`), so no stop words were removed, and recent scikit-learn
# versions reject that call.
tfid_vectorizer = TfidfVectorizer(stop_words='english')
# Feed the captions to the TF-IDF vectorizer.
tfid_vectorizer.fit(train_captions)

## --- Step 8 --- ##
# Collect the vocabulary items used by the vectorizer and keep the entries whose
# value is below top_k.
# NOTE(review): vocabulary_ maps each term to its feature (column) index, not to a
# frequency rank, so this keeps the first top_k columns rather than the top_k most
# frequent terms — confirm the intent (max_features=top_k may be what is wanted).
dictionary = tfid_vectorizer.vocabulary_.items()
vocabulary = {term: column for term, column in dictionary if column < top_k}

In [20]:
# This will find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    """Return the length of the longest element of `tensor`.

    Args:
        tensor: iterable of sized items (e.g. a list of token-index lists).

    Returns:
        The largest len(item), or 0 when `tensor` is empty (instead of the
        ValueError that max() raises on an empty iterable).
    """
    return max(map(len, tensor), default=0)

In [21]:
# Compute max_length: the number of tokens in the longest (unpadded) caption sequence.
# Per the original note it is used for storing the attention weights (that code is not shown here).
max_length = calc_max_length(train_seqs)

In [24]:
# Display the index -> word (idx2word) mapping.
index_word

{1: 'a',
 2: '<start>',
 3: '<end>',
 4: 'on',
 5: 'of',
 6: 'the',
 7: 'in',
 8: 'with',
 9: 'and',
 10: 'is',
 11: 'man',
 12: 'to',
 13: 'sitting',
 14: 'an',
 15: 'two',
 16: 'standing',
 17: 'at',
 18: 'people',
 19: 'are',
 20: 'next',
 21: 'white',
 22: 'woman',
 23: 'table',
 24: 'street',
 25: 'that',
 26: 'holding',
 27: 'person',
 28: 'large',
 29: 'some',
 30: 'it',
 31: 'down',
 32: 'top',
 33: 'group',
 34: 'up',
 35: 'field',
 36: 'small',
 37: 'near',
 38: 'tennis',
 39: 'his',
 40: 'front',
 41: 'black',
 42: 'plate',
 43: 'room',
 44: 'train',
 45: 'riding',
 46: 'dog',
 47: 'red',
 48: 'by',
 49: 'young',
 50: 'cat',
 51: 'water',
 52: 'baseball',
 53: 'has',
 54: 'while',
 55: 'walking',
 56: 'playing',
 57: 'bathroom',
 58: 'sign',
 59: 'blue',
 60: 'food',
 61: 'kitchen',
 62: 'grass',
 63: 'bus',
 64: 'there',
 65: 'pizza',
 66: 'parked',
 67: 'green',
 68: 'side',
 69: 'other',
 70: 'snow',
 71: 'looking',
 72: 'building',
 73: 'bed',
 74: 'ball',
 75: 'beach',


In [23]:
# Create the masks used to erase the padding part of a sentence when training:
# one float vector per caption, 1.0 on the first len(caption) positions and 0.0 on
# the rest (all ones when the caption is longer than the mask).
# The mask width follows the padded caption width rather than repeating the literal
# 20, so it cannot drift from the maxlen used for cap_padded.
mask_width = cap_padded.shape[1]
masks = [(np.arange(mask_width) < len(seq)).astype(np.float64) for seq in train_seqs]

# Persist the padded word indices together with their masks.
# np.save pickles the dict as an object array, so reading it back requires
# np.load('data.npy', allow_pickle=True).
data = {'word_idxs': cap_padded, 'masks': masks}
np.save('data.npy', data)

In [None]:
# Pair every image path with its padded caption, image id and mask.
dataset = tf.data.Dataset.from_tensor_slices((img_name_vector, cap_padded, image_ids, masks))

def map_func(img_name_vector, cap_padded, image_ids, masks):
    """Load the array stored for one dataset element.

    Called through tf.py_func, so despite the plural parameter names every
    argument is a single element of the dataset.

    Args:
        img_name_vector: image path as bytes; the array is read from
            '<image path>.npy' (presumably a pre-computed image feature file —
            verify that these files exist next to the images).
        cap_padded: padded word-index sequence of the caption.
        image_ids: id of the image.
        masks: padding mask of the caption.

    Returns:
        (img_tensor, cap_padded, image_ids, masks) as numpy arrays cast to
        float32, int32, int32 and float32, matching the output types declared
        in the py_func call below.
    """
    img_tensor = np.load(img_name_vector.decode('utf-8') + '.npy')
    return (np.asarray(img_tensor, dtype=np.float32),
            np.asarray(cap_padded, dtype=np.int32),
            np.asarray(image_ids, dtype=np.int32),
            np.asarray(masks, dtype=np.float32))

# Datasets are immutable: map() returns a new dataset, so the result must be kept.
# num_parallel_calls is an argument of map(), not of tf.py_func.
dataset = dataset.map(
    lambda item1, item2, item3, item4: tf.py_func(
        map_func, [item1, item2, item3, item4],
        [tf.float32, tf.int32, tf.int32, tf.float32]),
    num_parallel_calls=2)

# shuffling and batching
dataset = dataset.shuffle(10000)  # BUFFER_SIZE = 10000
dataset = dataset.repeat(5)
dataset = dataset.batch(64)
dataset = dataset.prefetch(1)
