In [2]:
import re
import json
import collections
from constants import START_TOKEN, END_TOKEN

In [3]:
def get_train_image_captions_mapping():
    """
    Returns a dictionary mapping image path to the list of captions
    :return: dict {image_path: [caption1, caption2, ...]}
    """
    try:
        result = {}
        with open('data/annotations/captions_train2014.json', 'r') as f:
            annotations = json.load(f)
            for val in annotations['annotations']:
                caption = f"{val['caption']}"
                image_path = '/data/train/' + 'COCO_train2014_' + '%012d.jpg' % (val['image_id'])
                result.setdefault(image_path, []).append(caption)
        return result
    except FileNotFoundError:
        print("Don't have the annotations file. Please run data_download.py to download the data.")

In [5]:
def clean_bad_text_data(image_captions_mapping):
    """
    Removes punctuation and numbers, conver to lower from the captions
    """
    for path, captions in image_captions_mapping.items():
        for i in range(len(captions)):
            cap = captions[i]
            cap = cap.split()

            # convert to lower case
            cap = [w.lower() for w in cap if w]

            # remove punctuation from each token
            cap = [re.sub(r'[^\w\s]','',w) for w in cap]

            # remove tokens with numbers in them
            cap = [w for w in cap if w.isalpha()]

            captions[i] =  ' '.join(cap)
            
    return image_captions_mapping

In [6]:
a = get_train_image_captions_mapping()
clean_bad_text_data(a)

{'/data/train/COCO_train2014_000000318556.jpg': ['a very clean and well decorated empty bathroom',
  'a blue and white bathroom with butterfly themed wall tiles',
  'a bathroom with a border of butterflies and blue paint on the walls above it',
  'an angled view of a beautifully decorated bathroom',
  'a clock that blends in with the wall hangs in a bathroom'],
 '/data/train/COCO_train2014_000000116100.jpg': ['a panoramic view of a kitchen and all of its appliances',
  'a panoramic photo of a kitchen and dining room',
  'a wide angle view of the kitchen work area',
  'multiple photos of a brown and white kitchen',
  'a kitchen that has a checkered patterned floor and white cabinets'],
 '/data/train/COCO_train2014_000000379340.jpg': ['a graffitied stop sign across the street from a red car',
  'a vandalized stop sign and a red beetle on the road',
  'a red stop sign with a bush bumper sticker under the word stop',
  'a stop sign that has been vandalized is pictured in front of a parked 

In [1]:
def add_end_start_tokens(image_captions_mapping):
    result = {}
    for key in image_captions_mapping:
        for i in range(len(image_captions_mapping[key])):
            result.setdefault(key, []).append(f'{START_TOKEN} {image_captions_mapping[key][i]} {END_TOKEN}')
    return result

In [8]:
b = add_end_start_tokens(a)
b

{'/data/train/COCO_train2014_000000318556.jpg': ['<start> a very clean and well decorated empty bathroom <end>',
  '<start> a blue and white bathroom with butterfly themed wall tiles <end>',
  '<start> a bathroom with a border of butterflies and blue paint on the walls above it <end>',
  '<start> an angled view of a beautifully decorated bathroom <end>',
  '<start> a clock that blends in with the wall hangs in a bathroom <end>'],
 '/data/train/COCO_train2014_000000116100.jpg': ['<start> a panoramic view of a kitchen and all of its appliances <end>',
  '<start> a panoramic photo of a kitchen and dining room <end>',
  '<start> a wide angle view of the kitchen work area <end>',
  '<start> multiple photos of a brown and white kitchen <end>',
  '<start> a kitchen that has a checkered patterned floor and white cabinets <end>'],
 '/data/train/COCO_train2014_000000379340.jpg': ['<start> a graffitied stop sign across the street from a red car <end>',
  '<start> a vandalized stop sign and a red 

In [10]:
def create_vocab(mapping, word_count_threshold = 10):
    # Create list of captions
    all_captions = []
    for captions in mapping.values():
        all_captions.extend(captions)

    # Allow only words which appear at least 10 times
    word_counts = {}
    nsents = 0
    for sent in all_captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]

    return vocab

In [12]:
vocab = create_vocab(b)
vocab

['<start>',
 'a',
 'very',
 'clean',
 'and',
 'well',
 'decorated',
 'empty',
 'bathroom',
 '<end>',
 'blue',
 'white',
 'with',
 'butterfly',
 'themed',
 'wall',
 'tiles',
 'border',
 'of',
 'butterflies',
 'paint',
 'on',
 'the',
 'walls',
 'above',
 'it',
 'an',
 'angled',
 'view',
 'beautifully',
 'clock',
 'that',
 'in',
 'hangs',
 'panoramic',
 'kitchen',
 'all',
 'its',
 'appliances',
 'photo',
 'dining',
 'room',
 'wide',
 'angle',
 'work',
 'area',
 'multiple',
 'photos',
 'brown',
 'has',
 'checkered',
 'patterned',
 'floor',
 'cabinets',
 'graffitied',
 'stop',
 'sign',
 'across',
 'street',
 'from',
 'red',
 'car',
 'vandalized',
 'road',
 'bush',
 'bumper',
 'sticker',
 'under',
 'word',
 'been',
 'is',
 'pictured',
 'front',
 'parked',
 'modified',
 'to',
 'read',
 'two',
 'people',
 'are',
 'walking',
 'down',
 'beach',
 'carrying',
 'surf',
 'boards',
 'teenagers',
 'at',
 'surfboards',
 'couple',
 'their',
 'guy',
 'girl',
 'holding',
 'sink',
 'toilet',
 'inside',
 's

In [13]:
import keras
import pickle
from keras.preprocessing import image
oov_token = '<UNK>'
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n' # to make sure all the last non digit non alphabet chars are removed
tokenizer = keras.preprocessing.text.Tokenizer(filters = filters, oov_token=oov_token)
tokenizer.fit_on_texts(vocab)
vocab_size = len(tokenizer.word_index) + 1
print('vocab_size :', vocab_size)
pickle.dump(tokenizer, open('process_data/word_tokenize.pkl', 'wb'))

vocab_size : 6306


In [14]:
# RUN THISSSSSS
with open('process_data/word_tokenize.pkl','rb') as f:
    tokenizer = pickle.load(f)  

vocab_size = len(tokenizer.word_index) + 1
vocab_size

6306