In [1]:
import pickle
import collections
import numpy as np

In [2]:
with open('./Caption_description.pkl','rb') as f:
    Caption_dict = pickle.load(f)

`Build A vocab with frequent words [threshold > 10]`

- Create a vocab with unique words
- Create a list with all words.
- Extract those word, frequency/threshold > 10

In [3]:
# Create a set of Vocabulary and a list of all words in Caption_dict
vocab = set()
total_word = []
for key in Caption_dict.keys():
    [vocab.update(sentence.split()) for sentence in Caption_dict[key]]
    [total_word.append(word) for sentence in Caption_dict[key] for word in sentence.split()]
    
print(f'No. of unique words: {len(vocab)}, No. of Total word: {len(total_word)}')

No. of unique words: 8424, No. of Total word: 373837


In [4]:
# Exract words which have word frequency >10
threshold = 10
counter = collections.Counter(total_word)
freq_count = dict(counter)

extracted_words = [item[0] for item in freq_count.items() if item[1] >= threshold]
print(len(extracted_words))

1956


Import `Flickr_8k.trainImages.txt` File:

In [5]:
with open('../Dataset/Flickr8k/Flickr_TextData/Flickr_8k.trainImages.txt', encoding = 'utf-8') as f:
    trainfile = f.readlines()

In [6]:
train = [image_name.split('\n')[0] for image_name in trainfile]

In [7]:
# create a dict with train captions

train_caption = {}

for image_name in train:
    train_caption[image_name] = []
    for caption in Caption_dict[image_name]:
        caption = '<start>' + caption + '<end>'
        
        train_caption[image_name].append(caption)

In [8]:
len(train_caption)

6000

In [9]:
# create a word to index dictionary

word_to_idx = {}
idx_to_word = {}

for i,word in enumerate(extracted_words):
    word_to_idx[word] = i+1
    idx_to_word[i+1] = word
    
length = len(word_to_idx)

word_to_idx['<start>'] = length + 1
word_to_idx['<end>'] = length + 2

idx_to_word[length + 1] = '<start>'
idx_to_word[length + 2] = '<end>'

In [10]:
with open('Word_to_index.pkl.pkl','wb') as f:
    pickle.dump(word_to_idx,f)
    
with open('Index_to_word.pkl','wb') as f:
    pickle.dump(idx_to_word,f)

In [11]:
# Find max lengh caption

max_length = 0
for img_name in train_caption.keys():
    for caption in train_caption[img_name]:
        
        max_length = max(max_length,len(caption.split()))
        
print(max_length)

33


Build `Embedding Matrix`

We will map the every word(index) to a `50d vector` and for this purpose, we will use a pre-trained `GLOVE Model`:

In [12]:
embedding_idx = {}

with open('../Dataset/glove.6B/glove.6B.50d.txt',encoding='utf-8') as f:
    for line in f:
        values =line.split()
        word = values[0]
        word_embedding = np.array(values[1:],dtype='float')
        embedding_idx[word] = word_embedding

In [13]:
def get_embedding_matrix():
    emb_dim = 50
    vocab_size = len(word_to_idx)+1
    matrix = np.zeros((vocab_size,emb_dim))
    
    for word,idx in word_to_idx.items():
        emb_vector = embedding_idx.get(word)
        
        if emb_vector is not None:
            matrix[idx] = emb_vector
            
    return matrix

In [14]:
emb_mat = get_embedding_matrix()

In [15]:
np.save('embedding_matrix.npy',emb_mat)