In [1]:
import os
import re
import json
from collections import Counter
import numpy as np
from tqdm import tqdm
from itertools import islice
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     <local_path>
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     <local_path>
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
caption_file = '../data/Flickr8k_text/Flickr8k.token.txt'

with open(caption_file, 'r') as f:
    lines = f.readlines()

captions_dict = {}
for line in lines:
    img_id, caption = line.strip().split('\t')
    img_id = img_id.split('#')[0]
    caption = caption.lower()
    caption = re.sub(r'[^\w\s]', '', caption)

    if img_id not in captions_dict:
        captions_dict[img_id] = []
    captions_dict[img_id].append(caption)


In [3]:
dict(islice(captions_dict.items(), 5))

{'1000268201_693b08cb0e.jpg': ['a child in a pink dress is climbing up a set of stairs in an entry way ',
  'a girl going into a wooden building ',
  'a little girl climbing into a wooden playhouse ',
  'a little girl climbing the stairs to her playhouse ',
  'a little girl in a pink dress going into a wooden cabin '],
 '1001773457_577c3a7d70.jpg': ['a black dog and a spotted dog are fighting',
  'a black dog and a tricolored dog playing with each other on the road ',
  'a black dog and a white dog with brown spots are staring at each other in the street ',
  'two dogs of different breeds looking at each other on the road ',
  'two dogs on pavement moving toward each other '],
 '1002674143_1b742ab4b8.jpg': ['a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl ',
  'a little girl is sitting in front of a large painted rainbow ',
  'a small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it ',
  'there is a gir

In [4]:
def tokenize_caption(caption):
    tokens = word_tokenize(caption)
    return ['<start>'] + tokens + ['<end>']

tokenized_captions = {
    k: [tokenize_caption(c) for c in v]
    for k, v in captions_dict.items()
}

In [5]:
dict(islice(tokenized_captions.items(), 5))

{'1000268201_693b08cb0e.jpg': [['<start>',
   'a',
   'child',
   'in',
   'a',
   'pink',
   'dress',
   'is',
   'climbing',
   'up',
   'a',
   'set',
   'of',
   'stairs',
   'in',
   'an',
   'entry',
   'way',
   '<end>'],
  ['<start>',
   'a',
   'girl',
   'going',
   'into',
   'a',
   'wooden',
   'building',
   '<end>'],
  ['<start>',
   'a',
   'little',
   'girl',
   'climbing',
   'into',
   'a',
   'wooden',
   'playhouse',
   '<end>'],
  ['<start>',
   'a',
   'little',
   'girl',
   'climbing',
   'the',
   'stairs',
   'to',
   'her',
   'playhouse',
   '<end>'],
  ['<start>',
   'a',
   'little',
   'girl',
   'in',
   'a',
   'pink',
   'dress',
   'going',
   'into',
   'a',
   'wooden',
   'cabin',
   '<end>']],
 '1001773457_577c3a7d70.jpg': [['<start>',
   'a',
   'black',
   'dog',
   'and',
   'a',
   'spotted',
   'dog',
   'are',
   'fighting',
   '<end>'],
  ['<start>',
   'a',
   'black',
   'dog',
   'and',
   'a',
   'tricolored',
   'dog',
   'playing',


In [6]:
def load_image_list(filepath):
    with open(filepath, 'r') as f:
        return [line.strip() for line in f]

train_images = load_image_list('../data/Flickr8k_text/Flickr_8k.trainImages.txt')
test_images = load_image_list('../data/Flickr8k_text/Flickr_8k.testImages.txt')

In [7]:
all_tokens = []
for caption_list in tokenized_captions.values():
    for tokens in caption_list:
        all_tokens.extend(tokens)

word_counts = Counter(all_tokens)
threshold = 5  
vocab = [word for word, count in word_counts.items() if count >= threshold]

vocab = ['<pad>', '<start>', '<end>', '<unk>'] + sorted(set(vocab))

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)

print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 2995


In [8]:
vocab[:10]

['<pad>', '<start>', '<end>', '<unk>', '2', '23', '3', '4', '5', '6']

In [9]:
word2idx['<pad>']

0

In [14]:
idx2word[13]

'a'

In [11]:
def encode_caption(tokens, word2idx, max_len=20):
    encoded = [word2idx.get(w, word2idx['<unk>']) for w in tokens]
    encoded = [idx if idx < vocab_size else word2idx['<unk>'] for idx in encoded]
    if len(encoded) < max_len:
        encoded += [word2idx['<pad>']] * (max_len - len(encoded))
    else:
        encoded = encoded[:max_len]
    return encoded

max_len = 20

encoded_captions = {}
for img_id, caption_list in tokenized_captions.items():
    encoded_captions[img_id] = [
        encode_caption(tokens, word2idx, max_len)
        for tokens in caption_list
    ]


In [12]:
dict(islice(encoded_captions.items(), 5))

{'1000268201_693b08cb0e.jpg': [[12,
   13,
   503,
   1285,
   13,
   1882,
   775,
   1318,
   537,
   2828,
   13,
   2227,
   1702,
   2475,
   1285,
   61,
   3,
   2909,
   11,
   0],
  [12,
   13,
   1062,
   1083,
   1316,
   13,
   2964,
   361,
   11,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [12,
   13,
   1486,
   1062,
   537,
   1316,
   13,
   2964,
   1913,
   11,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [12,
   13,
   1486,
   1062,
   537,
   2664,
   2475,
   2706,
   1205,
   1913,
   11,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [12,
   13,
   1486,
   1062,
   1285,
   13,
   1882,
   775,
   1083,
   1316,
   13,
   2964,
   3,
   11,
   0,
   0,
   0,
   0,
   0,
   0]],
 '1001773457_577c3a7d70.jpg': [[12,
   13,
   257,
   751,
   62,
   13,
   2450,
   751,
   80,
   922,
   11,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [12,
   13,
   257,
   751,
   62,
   13,
   2765,
   751,
   19

In [13]:
with open('../data/flickr_vocab.json', 'w') as f:
    json.dump({'word2idx': word2idx, 'idx2word': idx2word}, f)

with open('../data/flickr_encoded_captions.json', 'w') as f:
    json.dump(encoded_captions, f)

json.dump(train_images, open('../data/flickr_train_images.json', 'w'))
json.dump(test_images, open('../data/flickr_test_images.json', 'w'))
