In [6]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from PIL import Image
import os
from tqdm import tqdm

import nltk

from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from caption_utils import load_image, calc_max_length, plot_attention, cache_bottlenecks

In [2]:
# Paths to the serialized dataset arrays and to the output directories.
# Training split (.npy files under ../Segmentation/COCO).
TRAIN_INPUT = "../Segmentation/COCO/train_input.npy"
TRAIN_LABEL = "../Segmentation/COCO/train_label.npy"

# Test split; note that the files on disk are named "val_*".
TEST_INPUT = "../Segmentation/COCO/val_input.npy"
TEST_LABEL = "../Segmentation/COCO/val_label.npy"

# NOTE(review): presumably the checkpoint and plot output locations; they are
# not used in this part of the notebook -- confirm against later cells.
CKPT_DIR= "log/imgCaption"
PLOT_DIR = "log/plot"

# Load the training set.
# train_captions holds caption strings wrapped in '<start> ... <end>' markers
# (see the train_captions[0] cell further down).
# NOTE(review): img_name_vector presumably holds image file names/paths -- confirm.
img_name_vector = np.load(TRAIN_INPUT)
train_captions = np.load(TRAIN_LABEL)


# Load the test set (same layout as the training arrays is assumed -- TODO confirm).
test_img_name_vector = np.load(TEST_INPUT)
test_captions = np.load(TEST_LABEL)

In [4]:
# Hyperparameters
# Configuration values used for training.
BATCH_SIZE = 64
BUFFER_SIZE = 1000  # NOTE(review): presumably the dataset shuffle buffer size -- confirm
embedding_dim = 256
units = 512
top_k = 15000  # vocabulary cap; passed to the tokenizer as num_words
vocab_size = top_k + 1
attention_features_shape = 64  # NOTE(review): meaning not visible in this section -- confirm
EPOCHS = 20

In [7]:
# Load the caption data and preprocess it.
# Build the vocabulary from the most frequent words (capped via num_words=top_k);
# words outside the vocabulary are mapped to the <unk> token.
# The filter set contains neither '<' nor '>', so the <start>/<end> markers
# survive tokenization as whole tokens.
# The backslash is spelled '\\' on purpose: a bare '\]' is an invalid escape
# sequence in a normal string literal and triggers a warning on modern Python,
# while '\\]' yields exactly the same characters at runtime.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')

tokenizer.fit_on_texts(train_captions)
# Reserve index 0 for the <pad> token that fills captions shorter than the
# longest one.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Convert every caption into a list of integer token ids.
train_seqs = tokenizer.texts_to_sequences(train_captions)
# Pad the shorter sequences at the end (padding='post') so all rows share one length.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')
# Keep the length of the longest caption sequence; it is needed later for the
# attention weights. (calc_max_length comes from caption_utils.)
max_length = calc_max_length(train_seqs)

In [37]:
# Inspect the id -> word mapping built by the tokenizer.
tokenizer.index_word

{1: '<unk>',
 2: 'a',
 3: '<start>',
 4: '<end>',
 5: 'on',
 6: 'of',
 7: 'the',
 8: 'in',
 9: 'with',
 10: 'and',
 11: 'is',
 12: 'man',
 13: 'to',
 14: 'sitting',
 15: 'an',
 16: 'two',
 17: 'at',
 18: 'standing',
 19: 'people',
 20: 'are',
 21: 'next',
 22: 'white',
 23: 'woman',
 24: 'street',
 25: 'table',
 26: 'holding',
 27: 'that',
 28: 'large',
 29: 'it',
 30: 'person',
 31: 'some',
 32: 'down',
 33: 'top',
 34: 'group',
 35: 'up',
 36: 'field',
 37: 'small',
 38: 'tennis',
 39: 'near',
 40: 'front',
 41: 'his',
 42: 'black',
 43: 'train',
 44: 'plate',
 45: 'room',
 46: 'dog',
 47: 'riding',
 48: 'red',
 49: 'cat',
 50: 'young',
 51: 'by',
 52: 'water',
 53: 'baseball',
 54: 'has',
 55: 'while',
 56: 'playing',
 57: 'walking',
 58: 'bathroom',
 59: 'sign',
 60: 'blue',
 61: 'kitchen',
 62: 'food',
 63: 'grass',
 64: 'there',
 65: 'bus',
 66: 'green',
 67: 'pizza',
 68: 'parked',
 69: 'side',
 70: 'building',
 71: 'other',
 72: 'bed',
 73: 'looking',
 74: 'snow',
 75: 'beach',

In [20]:
# First raw training caption, including its <start>/<end> markers.
train_captions[0]

'<start> A man holding a device while walking down a sidewalk. <end>'

In [21]:
# Token ids the tokenizer produced for the first training caption.
train_seqs[0]

[3, 2, 12, 26, 2, 949, 55, 57, 32, 2, 192, 4]

In [22]:
# The same caption after post-padding: trailing zeros are the <pad> id.
cap_vector[0]

array([  3,   2,  12,  26,   2, 949,  55,  57,  32,   2, 192,   4,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
      dtype=int32)

In [24]:
# Sanity check: tokenize a caption written without <start>/<end> markers or
# punctuation, using the tokenizer fitted on the training captions.
sample=['a man holding a device while walking down a sidewalk']

# texts_to_sequences takes a list of texts and returns one id list per text.
sequences = tokenizer.texts_to_sequences(sample)
print(sequences)

[[2, 12, 26, 2, 949, 55, 57, 32, 2, 192]]


In [43]:
# Contrast with the custom tokenizer: text_to_word_sequence is called here
# without a filters argument, and the recorded output shows 'start'/'end'
# with the angle brackets stripped from the <start>/<end> markers.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel run declares all dependencies up front.
from keras.preprocessing.text import text_to_word_sequence
text = '<start> A man holding a device while walking down a sidewalk. <end>'
result = text_to_word_sequence(text)
print(result)

['start', 'a', 'man', 'holding', 'a', 'device', 'while', 'walking', 'down', 'a', 'sidewalk', 'end']


In [45]:
# Hand-written token list with the <start>/<end> markers kept intact, printed
# for reference next to the text_to_word_sequence result.
print(['<start>', 'a', 'man', 'holding', 'a', 'device', 'while', 'walking', 'down', 'a', 'sidewalk', '<end>'])

['<start>', 'a', 'man', 'holding', 'a', 'device', 'while', 'walking', 'down', 'a', 'sidewalk', '<end>']
