## Miscellaneous   
This notebook contains all the intermediate steps and calculations during modelling.

In [2]:
import os
import pickle
import string

import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle

### Create Training and Dev image sets

In [14]:
# Read the file containing all caption-image pairs.
# The whole file is kept as one string; the cells below split it by line and
# parse each line as "<image file>#<caption number> <caption words ...>".
with open('Dataset/Flickr30k/Flickr30k.token.txt', 'r') as file:
    annotations = file.read()

In [15]:
# Absolute path of the image folder; the trailing slash is included so that an
# image file name can be appended to it directly
PATH = os.path.abspath('.') + '/Dataset/Flickr30k/flickr30k-images/'

# Parallel lists: all_captions[k] is one caption of the image at all_img_name_vector[k]
all_captions = []
all_img_name_vector = []

for record in annotations.split("\n"):
    # empty lines carry no annotation
    if not record:
        continue
    fields = record.split()
    # first field is "<image file>#<caption number>"; keep only the file name
    image_file = fields[0].split('#')[0]
    # remaining fields are the caption words, wrapped in the sentence markers
    caption_text = "<start> " + ' '.join(fields[1:]) + " <end>"

    all_img_name_vector.append(PATH + image_file)
    all_captions.append(caption_text)

In [16]:
len(all_captions), all_img_name_vector[0], all_captions[0]

(158915,
 '/home/shailesh/Projects/mytf2/Flickr30k_notebooks/Dataset/Flickr30k/flickr30k-images/1000092795.jpg',
 '<start> Two young guys with shaggy hair look at their hands while hanging out in the yard . <end>')

In [17]:
# Unique image paths, sorted so the seeded shuffle always starts from the same
# order, then shuffled. The shuffled list is what gets cut into the train and
# dev image sets below.
unique_image_paths = sorted(set(all_img_name_vector))
encode_train = shuffle(unique_image_paths, random_state=7)

In [18]:
# last 2k images be dev set
len(encode_train)-2000

29783

In [19]:
# Write the training image file names, one per line: everything in the shuffled
# list except the last DEV_SET_SIZE entries, which are reserved for the dev set.
# NOTE(review): the file is written to the working directory, while the loading
# cell further down reads it from Dataset/Flickr30k/ - confirm where it should live.
DEV_SET_SIZE = 2000
with open("Flickr_30k.trainImages.txt", 'w') as train_file:
    for image_path in encode_train[:-DEV_SET_SIZE]:
        # keep only the file name, i.e. the part after the last '/'
        train_file.write(image_path.split('/')[-1] + '\n')

In [20]:
# Write the dev image file names, one per line: the last DEV_SET_SIZE entries
# of the shuffled list.
# NOTE(review): the file is written to the working directory, while the loading
# cell further down reads it from Dataset/Flickr30k/ - confirm where it should live.
DEV_SET_SIZE = 2000
with open("Flickr_30k.devImages.txt", 'w') as dev_file:
    for image_path in encode_train[-DEV_SET_SIZE:]:
        # keep only the file name, i.e. the part after the last '/'
        dev_file.write(image_path.split('/')[-1] + '\n')

### Vocab Size Determination

In [21]:
# Read the file containing all caption-image pairs
# (same file as in the previous section, loaded again into `annotations`)
with open('Dataset/Flickr30k/Flickr30k.token.txt', 'r') as file:
    annotations = file.read()

# helper for reading the predefined train / validation image identifier lists
def load_set(filename):
    """Return the distinct non-empty lines of `filename` as a set.

    Every line is kept verbatim (nothing is stripped or truncated), so an
    identifier stored with its file extension keeps that extension.
    """
    with open(filename, 'r') as handle:
        lines = handle.read().split('\n')
    return {line for line in lines if line}

# load the train set identifiers
# NOTE(review): the split files are written earlier in this notebook as
# "Flickr_30k.trainImages.txt" / "Flickr_30k.devImages.txt" in the working
# directory, but are read here from Dataset/Flickr30k/ - presumably they were
# moved there by hand; confirm.
train_set = load_set('Dataset/Flickr30k/Flickr_30k.trainImages.txt')

# load the validation set identifiers
val_set = load_set('Dataset/Flickr30k/Flickr_30k.devImages.txt')

In [22]:
len(train_set), len(val_set)

(29783, 2000)

In [23]:
# Path to the image folder. Image paths are built with os.path.join so that a
# separator is always placed between the folder and the file name (plain
# concatenation with this PATH produced ".../flickr30k-images<name>.jpg").
PATH = os.path.join(os.path.abspath('.'), 'Dataset', 'Flickr30k', 'flickr30k-images')

# Captions and image paths, kept as parallel lists for each split
train_captions = []
img_name_train = []
val_captions = []
img_name_val = []

# splitting the file contents by line
for annot in annotations.split("\n"):
    # split once: first field is "<image file>#<caption number>", the rest is the caption
    fields = annot.split()
    # skip empty and whitespace-only lines
    if not fields:
        continue
    # drop the caption number to get the image file name
    image_id = fields[0].split('#')[0]
    # add <start> and <end> token to the caption
    caption = "<start> " + ' '.join(fields[1:]) + " <end>"
    # convert image id into the image path
    full_image_path = os.path.join(PATH, image_id)

    # add the caption and image path to the lists of the split the image
    # belongs to; images listed in neither split are ignored
    if image_id in train_set:
        train_captions.append(caption)
        img_name_train.append(full_image_path)
    elif image_id in val_set:
        val_captions.append(caption)
        img_name_val.append(full_image_path)

In [24]:
len(train_captions), len(img_name_train), len(val_captions), len(img_name_val)

(148915, 148915, 10000, 10000)

In [25]:
train_captions, img_name_train = shuffle(train_captions, img_name_train, random_state = 1)

In [26]:
# Flatten the training captions into one list of whitespace-separated tokens,
# leaving out tokens that occur inside string.punctuation (e.g. "." or ",").
# Tokens keep their original casing, so "Kiss" and "kiss" are counted separately.
all_words = [
    token
    for sentence in train_captions
    for token in sentence.split()
    if token not in string.punctuation
]

In [27]:
from collections import Counter
word_count = Counter(all_words)

In [28]:
word_count.most_common(8000)[-10:]

[('deserts', 4),
 ('inn', 4),
 ('anthem', 4),
 ('shanty', 4),
 ('cushions', 4),
 ('Interracial', 4),
 ('scrub', 4),
 ('flings', 4),
 ('Twenty', 4),
 ('Kiss', 4)]

In [29]:
len(word_count)

22796

In [30]:
from itertools import dropwhile

# most_common() returns a list sorted by descending count, so skipping the
# leading entries with count >= 5 leaves exactly the words seen fewer than
# 5 times. Iterating that list (not the Counter itself) makes deleting safe.
ranked_counts = word_count.most_common()
rare_entries = dropwhile(lambda entry: entry[1] >= 5, ranked_counts)
for rare_word, rare_count in rare_entries:
    del word_count[rare_word]

In [31]:
len(word_count)

7939

In [32]:
# Vocab_size=8k => dropping all words that appear fewer than 5 times leaves 7939 words (see above), i.e. roughly 8k

### Prepare pretrained embedding matrix

1. create tokenizer
2. load glove vector
3. compare the 2 word vectors and get the words not in glove
4. replace tokenizer index with `<unk>` for those words

In [71]:
## 1. Create Tokenizer

# keep the top 8000 words of the vocabulary
top_k = 8000

# Tokenizer that uses <unk> for out-of-vocabulary words and strips punctuation.
# '<' and '>' are not in the filter, so the <start>/<end> markers survive.
# The backslash is written as '\\': the previous '\]' was an invalid escape
# sequence that only produced the same string because Python leaves unknown
# escapes untouched (while emitting a warning).
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=top_k,
    oov_token='<unk>',
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ',
)

# fit tokenizer on train captions
tokenizer.fit_on_texts(train_captions)

# reserve index 0 for the padding token
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# convert train captions to tokenized sequences (one list of word indexes per caption)
train_seqs = tokenizer.texts_to_sequences(train_captions)


In [9]:
## 2. Load Glove Vector

# word -> float32 coefficient vector, one entry per line of the GloVe text file
embeddings_index = dict()

# `with` closes the file even if parsing a line fails; the encoding is stated
# explicitly so the result does not depend on the platform default
with open('../Datasets/glove.42B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # tolerate blank lines instead of failing on values[0]
        if not values:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype=np.float32)
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1917494 word vectors.


In [22]:
## 3. Compare the two vocabularies and get the words not in glove

# All indexes >= 8000 are treated as <unk> by the tokenizer.
# word_index is in insertion order with <pad> added last, so the first 7999
# keys are the words with indexes 1..7999 and <pad> is excluded by the slice.
vocab_words = list(tokenizer.word_index.keys())[:7999]
# Keys view of the GloVe dict: membership is a hash lookup. Copying the keys
# into a list made every `in` test a linear scan over all GloVe words.
glove_words = embeddings_index.keys()

not_in_glove = [our_word for our_word in vocab_words if our_word not in glove_words]

len(not_in_glove)


9

In [21]:
list(tokenizer.word_index.keys())[-2:], list(tokenizer.word_index.keys())[7999]

(['whil', '<pad>'], 'waterskies')

In [65]:
list(tokenizer.word_index.keys())[7998]

'sheriff'

In [62]:
tokenizer.index_word[7999], tokenizer.word_index['outside'], list(tokenizer.word_index.keys())[-1]

('sheriff', 56, '<pad>')

In [23]:
# Drop the special tokens from the list; they get their own embedding rows
# later in the notebook. Filtering by name instead of slicing off the first
# three entries keeps this cell safe to re-run (a second `[3:]` would silently
# discard real words).
SPECIAL_TOKENS = ('<unk>', '<start>', '<end>')
not_in_glove = [word for word in not_in_glove if word not in SPECIAL_TOKENS]
not_in_glove

['rollerskaters',
 'surfboarder',
 'graffited',
 'parasailer',
 'outstreached',
 'ggauged']

In [73]:
## 4. replace tokenizer index with <unk> for those words
tokenizer.word_index['<unk>']

1

In [74]:
# Remap every word without a GloVe vector to <unk> in both directions:
# word -> 1 (the <unk> index) and old index -> '<unk>'.
# The old indexes are collected in unk_idxs before they are overwritten.
unk_idxs = []
for oov_word in not_in_glove:
    original_index = tokenizer.word_index[oov_word]
    print(oov_word, original_index)
    unk_idxs.append(original_index)
    tokenizer.index_word[original_index] = '<unk>'
    tokenizer.word_index[oov_word] = 1

rollerskaters 4525
surfboarder 5758
graffited 5768
parasailer 6096
outstreached 6752
ggauged 7953


In [75]:
unk_idxs

[4525, 5758, 5768, 6096, 6752, 7953]

In [84]:
# testing
tokenizer.texts_to_sequences(['He is a boy', 'rollerskaters surfboarder graffited parasailer outstreached ggauged test over'])

[[179, 10, 2, 31], [1, 1, 1, 1, 1, 1, 2720, 70]]

In [27]:
tokenizer.sequences_to_texts([[10, 20], [4525, 5758, 5768, 6096, 6752, 7953, 2000, 3000, 5000, 10000]])

['is wearing',
 '<unk> <unk> <unk> <unk> <unk> <unk> midst checkout parading <unk>']

In [89]:
# save the tokenizer
with open('tokenizer-8k-vocab.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [96]:
with open('unk-idxs.pkl', 'wb') as file:
    pickle.dump(unk_idxs, file)

In [90]:
# Test the saved files
with open('tokenizer-8k-vocab.pkl', 'rb') as file:
    token = pickle.load(file)

In [100]:
with open('unk-idxs.pkl', 'rb') as file:
    unk_list = pickle.load(file)

In [94]:
token.sequences_to_texts([[10, 20], [4525, 5758, 5768, 6096, 6752, 7953, 2000, 3000, 5000, 10000]])

['is wearing',
 '<unk> <unk> <unk> <unk> <unk> <unk> midst checkout parading <unk>']

In [95]:
token.texts_to_sequences(['He is a boy', 'rollerskaters surfboarder graffited parasailer outstreached ggauged test over'])

[[179, 10, 2, 31], [1, 1, 1, 1, 1, 1, 2720, 70]]

In [101]:
unk_list==unk_idxs

True

### Create weight matrix for Embedding layer

1. Create weight matrix for all words in word vec
2. Create special vectors for unique tokens
3. Extend the word tokens

In [3]:
# Test the saved files
with open('tokenizer-8k-vocab.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

In [4]:
with open('unk-idxs.pkl', 'rb') as file:
    unk_idxs = pickle.load(file)

Special tokens: `<start>, <end>, <unk>, <pad>`   
1. For `<pad>`, the embedding vector is a zero vector, since it is never trained and should not convey any meaning.   
2. For `<start>`, `<end>` and `<unk>`, a dedicated extra column is appended to the embedding (one per token), and each token's vector is one-hot in its own column.

In [61]:
# embeddings_index > glove_embeddings dict
# embedding_matrix > weights matrix for our Embedding

# from tokenizer definition above
top_k=8000
# 300 GloVe dimensions + 3 extra columns (300, 301, 302) used as one-hot
# slots for <start>, <end> and <unk> respectively
embedding_dim=303

In [44]:
vocab_size = top_k
embedding_matrix = np.zeros((vocab_size, embedding_dim))
# Fill one row per tokenizer index with the word's pretrained GloVe vector.
# The GloVe coefficients occupy the leading columns; the trailing columns are
# the extra slots for the special tokens and stay zero for ordinary words.
# Words without a GloVe vector are reported and their row is left as zeros.
for word, i in tokenizer.word_index.items():
    # indexes at or beyond vocab_size have no row in the matrix
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # sized from the vector itself rather than a hard-coded "+ 3"
        embedding_matrix[i, :len(embedding_vector)] = embedding_vector
    else:
        print("No vector found for:", word, i)

No vector found for: <unk> 1
No vector found for: <start> 3
No vector found for: <end> 4
No vector found for: rollerskaters 1
No vector found for: surfboarder 1
No vector found for: graffited 1
No vector found for: parasailer 1
No vector found for: outstreached 1
No vector found for: ggauged 1
No vector found for: <pad> 0


In [54]:
# One-hot vectors for the special tokens: all zeros except a single 1 in
# column 300 (<start>), 301 (<end>) or 302 (<unk>).
special_one_hots = np.eye(embedding_dim)[[300, 301, 302]]
start_vector = special_one_hots[0].copy()
end_vector = special_one_hots[1].copy()
unk_vector = special_one_hots[2].copy()
start_vector[-4:], end_vector[-4:], unk_vector[-4:]

(array([0., 1., 0., 0.]), array([0., 0., 1., 0.]), array([0., 0., 0., 1.]))

In [62]:
# Give each special token its one-hot row. The row numbers are looked up in
# the tokenizer instead of being hard-coded (they are 3, 4 and 1 here).
embedding_matrix[tokenizer.word_index['<start>']] = start_vector
embedding_matrix[tokenizer.word_index['<end>']] = end_vector
embedding_matrix[tokenizer.word_index['<unk>']] = unk_vector

# Words without a GloVe vector were remapped to <unk>; the rows at their
# former indexes get the <unk> vector as well
for idx in unk_idxs:
    embedding_matrix[idx] = unk_vector

In [63]:
# Save the embedding matrix to use for embedding layer in model
with open("embedding_matix.pkl", 'wb') as file:
    pickle.dump(embedding_matrix, file)

In [64]:
# test the saved file
with open("embedding_matix.pkl", 'rb') as file:
    emb_mat = pickle.load(file) 

In [65]:
(emb_mat==embedding_matrix).all()

True