In [1]:
from nltk.corpus import stopwords
import re # To clean the tweets
import string
import numpy as np
import pickle

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Root folder of the caption data. This is a machine-specific absolute Windows
# path; it has to be edited before the notebook can run on another machine.
DATA_PATH = r"D:\GBC2\DL1\Project\Data\flickr_8k"

In [4]:
def save_pickle(file, file_name):
    """Serialize an object to ``<file_name>.pkl`` using pickle.

    Args:
        file: The Python object to serialize (despite the name, this is the
            payload, not a file handle).
        file_name: Destination path without extension; ".pkl" is appended.
    """
    # The context manager guarantees the handle is closed (and buffered bytes
    # written out) even if pickling raises part-way through.
    with open(file_name + ".pkl", "wb") as handle:
        pickle.dump(file, handle)

In [5]:
def read_caption(path):
    """Read a caption file and return its data lines.

    The first line is treated as a header and skipped. Blank lines are
    ignored, so the result is the same whether or not the file ends with a
    trailing newline (slicing off the last element unconditionally would drop
    a real caption when the final newline is missing).

    Args:
        path: Path of the text file to read.

    Returns:
        List of non-blank lines after the header, without line terminators.
    """
    with open(path, 'r') as file:
        next(file, None)  # skip the header; the default keeps an empty file from raising
        text = [line.rstrip('\n') for line in file if line.strip()]
    print(f"Total number of captions: {len(text)}")
    return text

In [6]:
# Load the caption lines from captions.txt under DATA_PATH.
# The file name is appended with a Windows-style backslash separator.
data = read_caption(DATA_PATH + r'\captions.txt')

Total number of captions: 40455


In [7]:
def img_caption_dic(text):
    """Group caption lines by image name.

    Each line is split on commas. The first field is the image file name,
    which is cut at its first "." to form the key. The remaining fields are
    re-joined with single spaces to form the caption, so any comma inside a
    caption is replaced by a space.

    Args:
        text: Iterable of "<image file>,<caption>" strings.

    Returns:
        Dict mapping image name to the list of its captions, in input order.
    """
    grouped = {}
    for record in text:
        fields = record.split(',')
        stem = fields[0].split('.')[0]
        grouped.setdefault(stem, []).append(" ".join(fields[1:]))
    print(f"Total number of images: {len(grouped)}")
    return grouped

In [8]:
# Group the caption lines by image name.
captions_dic = img_caption_dic(data)

Total number of images: 8091


In [9]:
# Persist the image-name -> caption-list mapping to captions_dic.pkl.
save_pickle(captions_dic, "captions_dic")

In [9]:
def text_clean(s):
    """Normalize a caption string.

    Steps, in order: strip and lower-case; remove square-bracketed spans;
    remove URLs; remove angle-bracketed tags; remove ASCII punctuation;
    remove newline characters; remove every word that contains a digit.
    Whitespace left behind by removed tokens is not collapsed.

    Input: string to be cleaned
    Return: string after cleaning
    """
    s = s.strip().lower()  # trim surrounding whitespace, then lower-case
    # Patterns are raw strings so that regex escapes such as \[ , \S , \w and
    # \d reach the regex engine untouched instead of being parsed as (invalid)
    # Python string escapes, which raises warnings on current interpreters.
    s = re.sub(r'\[.*?\]', '', s)  # remove text enclosed in square brackets
    s = re.sub(r'https?://\S+|www\.\S+', '', s)  # remove URLs
    s = re.sub(r'<.*?>+', '', s)  # remove angle-bracketed tags
    s = re.sub('[%s]' % re.escape(string.punctuation), '', s)  # remove punctuation
    s = re.sub('\n', '', s)  # remove newline characters
    s = re.sub(r'\w*\d\w*', '', s)  # remove words containing a digit

    return s

In [10]:
def clean_captions(captions):
    """Return a new mapping with every caption passed through ``text_clean``.

    The input mapping and its caption lists are left untouched. Cleaning in
    place would make the raw and cleaned dictionaries the same object, so
    re-running a cell that saves the raw captions after this one would
    silently persist cleaned text under the raw name.

    Args:
        captions: Dict mapping image name to a list of caption strings.

    Returns:
        New dict with the same keys (same order) and cleaned caption lists.
    """
    return {
        image_name: [text_clean(caption) for caption in caption_list]
        for image_name, caption_list in captions.items()
    }

In [11]:
# Run text_clean over every caption of every image.
clean_captions_dic = clean_captions(captions_dic)

In [12]:
# Sanity check: number of images in the cleaned mapping.
len(clean_captions_dic)

8091

In [13]:
# Sanity check: total number of captions across all images.
count = sum(map(len, clean_captions_dic.values()))
print(count)

40455


In [14]:
def build_vocab(captions):
    """Collect the set of distinct whitespace-separated words in all captions.

    The set is built directly from the caption lists rather than by stacking
    them into a NumPy array first, so images with differing numbers of
    captions (a ragged mapping) are handled as well.

    Args:
        captions: Dict mapping image name to a list of caption strings.

    Returns:
        Set of unique words.
    """
    vocab = set()
    for caption_list in captions.values():
        for caption in caption_list:
            vocab.update(caption.split())
    print(f"Total number of words in vocabulary: {len(vocab)}")
    return vocab

In [15]:
# Collect the set of distinct words across all cleaned captions.
vocab = build_vocab(clean_captions_dic)

Total number of words in vocabulary: 8775


In [16]:
# Persist the cleaned captions to clean_captions_dic.pkl.
save_pickle(clean_captions_dic, "clean_captions_dic")

In [17]:
# Persist the vocabulary set to vocab.pkl.
save_pickle(vocab, "vocab")

In [18]:
# Hold out a reproducible random 20% of the images for validation; the
# remaining images form the training set.
all_image_name = list(clean_captions_dic.keys())
total_images = len(all_image_name)
val_percent = 0.2
np.random.seed(42)  # fixed seed so the split is the same on every run
val_image_name = np.random.choice(all_image_name, size = int(total_images*val_percent), replace = False)
# Membership is tested against a set: `name in <ndarray>` rescans the whole
# array for every image, while a set lookup is constant time.
val_image_lookup = set(val_image_name.tolist())
train_image_name = [name for name in all_image_name if name not in val_image_lookup]
print(f"Total number of images: {total_images}",f"\nNumber of images in training dataset: {len(train_image_name)}", f"\nNumber of images in validation dataset: {len(val_image_name)}")

Total number of images: 8091 
Number of images in training dataset: 6473 
Number of images in validation dataset: 1618


In [19]:
def train_val_captions(names, captions=None):
    """Select captions for the given images and wrap them in sos/eos markers.

    Args:
        names: Iterable of image names to include.
        captions: Optional dict mapping image name to a list of caption
            strings. Defaults to the notebook-level ``clean_captions_dic``,
            which keeps existing single-argument calls working.

    Returns:
        Dict mapping each name to its captions, each rewritten as
        "sos <caption> eos". A name that occurs more than once in ``names``
        has its captions appended once per occurrence.

    Raises:
        KeyError: If a name is missing from the caption mapping.
    """
    if captions is None:
        captions = clean_captions_dic

    cap_dict = dict()
    for name in names:
        wrapped = cap_dict.setdefault(name, [])
        wrapped.extend("sos " + caption + " eos" for caption in captions[name])

    return cap_dict

In [20]:
# Build the sos/eos-wrapped caption dictionaries for the two splits, then
# print the number of image names in each split.
train_captions = train_val_captions(train_image_name)
val_captions = train_val_captions(val_image_name)
print(len(train_image_name), len(val_image_name))

6473 1618


In [21]:
# Persist the per-split caption dictionaries to train_captions.pkl and val_captions.pkl.
save_pickle(train_captions, 'train_captions')
save_pickle(val_captions, 'val_captions')

In [22]:
# Load the image encodings from img_enc_dic.pkl in the working directory.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this project.
# The context manager closes the file handle once loading has finished.
with open("img_enc_dic.pkl", 'rb') as enc_file:
    image_encodings = pickle.load(enc_file)

In [23]:
def train_val_encodings(names, encodings=None):
    """Select the encodings that belong to the given images.

    Args:
        names: Iterable of image names to include.
        encodings: Optional dict mapping image name to its encoding. Defaults
            to the notebook-level ``image_encodings``, which keeps existing
            single-argument calls working.

    Returns:
        Dict mapping each name to its encoding (the stored object itself, not
        a copy).

    Raises:
        KeyError: If a name is missing from the encoding mapping.
    """
    if encodings is None:
        encodings = image_encodings

    return {name: encodings[name] for name in names}

In [24]:
# Pick out the image encodings for each split and print how many each split holds.
train_img_encodings = train_val_encodings(train_image_name)
val_img_encodings = train_val_encodings(val_image_name)
print(len(train_img_encodings), len(val_img_encodings))

6473 1618


In [25]:
# Persist the per-split encodings to train_img_encodings.pkl and val_img_encodings.pkl.
save_pickle(train_img_encodings, 'train_img_encodings')
save_pickle(val_img_encodings, 'val_img_encodings')

In [26]:
def build_tokenizer(cap_dic):
    """Fit a Keras ``Tokenizer`` on every caption in ``cap_dic``.

    The captions are flattened with a plain comprehension instead of going
    through a NumPy array, so images with differing numbers of captions are
    handled; the resulting order of captions is the same.

    Args:
        cap_dic: Dict mapping image name to a list of caption strings.

    Returns:
        The fitted Tokenizer.
    """
    text = [caption for caption_list in cap_dic.values() for caption in caption_list]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    return tokenizer

In [27]:
# Fit the tokenizer on the training captions only.
tokenizer = build_tokenizer(train_captions)

In [28]:
# Persist the fitted tokenizer to tokenizer.pkl.
save_pickle(tokenizer, 'tokenizer')

In [29]:
# Reverse lookup (integer index -> word) built by inverting the tokenizer's
# word_index, then saved to idx_word_dic.pkl.
idx_word_dic ={value: key for key, value in tokenizer.word_index.items()}
save_pickle(idx_word_dic, 'idx_word_dic')

In [30]:
# One more than the number of known words: Keras Tokenizer word indices start
# at 1 and index 0 is reserved, so the class count needs a slot for 0 as well.
vocab_size = len(tokenizer.word_index) + 1

In [31]:
def create_sequences(cap_dic, tokenizer, max_length, vocab_size, encodings):
    """Expand captions into next-word prediction samples.

    Every caption is converted to integer tokens. For each position ``i``
    from 1 up to the last token, one sample is produced: the first ``i``
    tokens padded to ``max_length`` as input, and token ``i`` one-hot encoded
    over ``vocab_size`` classes as target. Each sample is paired with the
    first element of the image's stored encoding.

    Args:
        cap_dic: Dict mapping image name to a list of caption strings.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length the input prefixes are padded to.
        vocab_size: Number of classes for the one-hot targets.
        encodings: Dict mapping image name to its encoding; ``encodings[name][0]``
            is used (presumably the feature vector -- confirm against the
            notebook that produced the encodings).

    Returns:
        Tuple of three NumPy arrays: image features, padded input sequences,
        and one-hot targets, with one row per generated sample.
    """
    image_features, partial_captions, next_words = [], [], []

    for name, caption_list in cap_dic.items():
        for caption in caption_list:
            token_ids = tokenizer.texts_to_sequences([caption])[0]
            for split_at in range(1, len(token_ids)):
                # Everything before split_at is the input; the token at split_at is the target.
                prefix = pad_sequences([token_ids[:split_at]], maxlen=max_length)[0]
                target = to_categorical([token_ids[split_at]], num_classes=vocab_size)[0]
                image_features.append(encodings[name][0])
                partial_captions.append(prefix)
                next_words.append(target)

    return np.array(image_features), np.array(partial_captions), np.array(next_words)

In [32]:
def find_max_length(cap_dic):
    """Return the word count of the longest caption in ``cap_dic``.

    Words are counted by splitting each caption on whitespace.

    Args:
        cap_dic: Dict mapping image name to a list of caption strings.

    Returns:
        The largest number of words found in any single caption.

    Raises:
        ValueError: If ``cap_dic`` contains no captions at all.
    """
    return max(
        len(caption.split())
        for caption_list in cap_dic.values()
        for caption in caption_list
    )

In [33]:
# Longest training caption, counted in whitespace-separated words; the
# captions passed in already carry the "sos"/"eos" markers, so they are counted too.
max_length = find_max_length(train_captions)

In [34]:
# Persist the maximum caption length to max_length.pkl.
save_pickle(max_length, "max_length")

In [35]:
# Expand the training captions into (image feature, padded prefix, one-hot next word) samples.
x1_train, x2_train, y_train = create_sequences(train_captions, tokenizer, max_length, vocab_size, train_img_encodings)

In [36]:
# Shapes of the three training arrays (the first dimension is the sample count).
print(x1_train.shape, x2_train.shape, y_train.shape)

(381626, 4096) (381626, 37) (381626, 7984)


In [37]:
# Persist the training arrays to x1_train.pkl, x2_train.pkl and y_train.pkl.
save_pickle(x1_train, 'x1_train')
save_pickle(x2_train, 'x2_train')
save_pickle(y_train, 'y_train')

In [38]:
# Expand the validation captions the same way, reusing the tokenizer,
# max_length and vocab_size derived from the training captions.
x1_val, x2_val, y_val = create_sequences(val_captions, tokenizer, max_length, vocab_size, val_img_encodings)

In [39]:
# Shapes of the three validation arrays (the first dimension is the sample count).
print(x1_val.shape, x2_val.shape, y_val.shape)

(94181, 4096) (94181, 37) (94181, 7984)


In [40]:
# Persist the validation arrays to x1_val.pkl, x2_val.pkl and y_val.pkl.
save_pickle(x1_val, 'x1_val')
save_pickle(x2_val, 'x2_val')
save_pickle(y_val, 'y_val')