In [2]:
import os
import shutil as sh
import string

INIT_DIR = "initial_data/"
IMG_DIR = "images/"
CAP_FILE = "captions.txt"
TABLE = str.maketrans("\n", " ", string.punctuation)

# def retrieve_img_cap(main_dir):
#     for sub_dir in os.listdir(main_dir):
#         img_paths = []
#         cap_path = None
#         cur_dir = os.path.join(main_dir, sub_dir)

#         for file_ in os.listdir(cur_dir):
#             if ".txt" in file_:
#                 cap_path = file_
#             else:
#                 img_paths.append(file_)

#         for img in img_paths:
#             sh.move(os.path.join(cur_dir, img).replace("\\", "/"), os.path.join(IMG_DIR, img).replace("\\", "/"))

#             with open(os.path.join(cur_dir, cap_path), "r") as cap_file_tmp:
#                 cap = cap_file_tmp.read()
                
#             nxt_line = ",".join((img, cap.translate(TABLE)))

#             with open(CAP_FILE, "a") as cap_file:
#                 cap_file.write(nxt_line + "\n")

# retrieve_img_cap(INIT_DIR)

In [3]:
import numpy as np
import cv2

IMG_SIZE_INC = (299, 299)
IMG_SIZE_VGG = (224, 224)
images_inc = []
images_vgg = []
    
def preprocess_for_inc(img_name):    
    img_path = os.path.join(IMG_DIR, img_name)
    img = cv2.imread(img_path)
    img = cv2.resize(img, IMG_SIZE_INC, interpolation=cv2.INTER_LANCZOS4)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0
    img = (img - 0.5) * 2.0
    
    return img

def preprocess_for_vgg(img_name):    
    img_path = os.path.join(IMG_DIR, img_name)
    img = cv2.imread(img_path)
    img = cv2.resize(img, IMG_SIZE_VGG, interpolation=cv2.INTER_LANCZOS4)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    img = img.astype(np.float32) / 255.0
    img = (img - 0.5) * 2.0
    
    return img

def preprocess_images(IMG_DIR):
    for img_name in os.listdir(IMG_DIR):
        images_inc.append(preprocess_for_inc(img_name))
        images_vgg.append(preprocess_for_vgg(img_name))
        
preprocess_images(IMG_DIR)

images_inc = np.array(images_inc).astype(np.float32)
images_vgg = np.array(images_vgg).astype(np.float32)

In [4]:
import re

CAPTION_FILE = "captions.txt"

def prepare_captions(cap_file):
    with open(cap_file, 'r') as f:
        captions = f.readlines()
    
    caption_dict = {}
    max_cap_length = 0
    
    for caption in captions:
        image_id, caption_text = caption.split(',', 1)
        caption_text = caption_text.lower().strip()
        
        if len(re.split("\s+", caption_text)) + 2 > max_cap_length:
            max_cap_length = len(re.split("\s+", caption_text)) + 2
        
        if image_id not in caption_dict:
            caption_dict[image_id] = []
            
        caption_dict[image_id].append("<begin> "+" ".join(re.split("\s+", caption_text))+" <end>")
        
    return caption_dict, max_cap_length

CAP_DIC, MAX_CAP_LENGTH = prepare_captions(CAPTION_FILE)

In [5]:
train_images_inc = []
train_images_vgg = []

for img in images_inc[:int(0.8 * len(images_inc))]:
    train_images_inc.append(img)
    
for img in images_vgg[:int(0.8 * len(images_vgg))]:
    train_images_vgg.append(img)
        
test_images_inc = []
test_images_vgg = []

for img in images_inc[int(0.8 * len(images_inc)):]:
    test_images_inc.append(img)
    
for img in images_vgg[int(0.8 * len(images_vgg)):]:
    test_images_vgg.append(img)

train_images_inc = np.array(train_images_inc).astype(np.float32)
test_images_inc = np.array(test_images_inc).astype(np.float32)
train_images_vgg = np.array(train_images_vgg).astype(np.float32)
test_images_vgg = np.array(test_images_vgg).astype(np.float32)

In [6]:
train_captions = []
test_captions = []

for image_id, captions in CAP_DIC.items():
    if image_id in os.listdir(IMG_DIR):
        if image_id in os.listdir(IMG_DIR)[:int(0.8 * len(images_inc))]:
            train_captions.extend(captions)
        else:
            test_captions.extend(captions)

In [7]:
import nltk

tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
word_to_index = {}
index_to_word = {}
index = 1

for caption in train_captions:
    tokens = tokenizer.tokenize(caption)
    
    for token in tokens:
        if token not in word_to_index:
            word_to_index[token] = index
            index_to_word[index] = token
            index += 1

MAX_VOCAB = len(word_to_index) + 1

In [8]:
train_seq = []
test_seq = []

for caption in train_captions:
    tokens = tokenizer.tokenize(caption)
    sequence = []
    
    for token in tokens:
        if token in word_to_index:
            sequence.append(word_to_index[token])
            
    train_seq.append(sequence)
    
for caption in test_captions:
    tokens = tokenizer.tokenize(caption)
    sequence = []
    
    for token in tokens:
        if token in word_to_index:
            sequence.append(word_to_index[token])
            
    test_seq.append(sequence)

In [9]:
import tensorflow as tf

# train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_seq, maxlen=MAX_CAP_LENGTH, padding="post")
# test_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_seq, maxlen=MAX_CAP_LENGTH, padding="post")

In [10]:
from tensorflow.keras.applications.inception_v3 import InceptionV3 as IV3
from tensorflow.keras.applications.vgg16 import VGG16 as V16
from tensorflow.keras.models import Model

inc_v3 = IV3(weights="imagenet")
inc_v3 = Model(inputs=inc_v3.inputs, outputs=inc_v3.layers[-2].output)

vgg_16 = V16(weights="imagenet")
vgg_16 = Model(inputs=vgg_16.inputs, outputs=vgg_16.layers[-2].output)

In [11]:
# feat_iv3 = inc_v3.predict(train_images_inc)
# np.save("inc_v3_ext_feat.npy", feat_iv3)
feat_iv3 = np.load("inc_v3_ext_feat.npy")

In [12]:
# feat_v16 = vgg_16.predict(train_images_vgg)
# np.save("vgg_16_ext_feat.npy", feat_v16)
feat_v16 = np.load("vgg_16_ext_feat.npy")

In [13]:
from tensorflow.keras.utils import to_categorical

def create_input_seq(unpadded_seq, features):
    for i in range(1, len(unpadded_seq)):
        in_, out = tf.keras.preprocessing.sequence.pad_sequences([unpadded_seq[:i]], maxlen=MAX_CAP_LENGTH, padding="post")[0], unpadded_seq[i]
        out = to_categorical([out], num_classes=MAX_VOCAB)[0]
        
        input_feat.append(features)
        input_seq.append(in_)
        output_seq.append(out)

In [15]:
### CHANGE THE CODE BELOW TO GET RESULTS FOR DIFFERENT COMBINATIONS ####

In [16]:
input_feat, input_seq, output_seq = [], [], []

for i in range(len(feat_iv3)):
    create_input_seq(train_seq[i], feat_iv3[i])
    # create_input_seq(train_seq[i], feat_v16[i])
    
input_feat = np.array(input_feat).astype(np.float32)
input_seq = np.array(input_seq).astype(np.float32)
output_seq = np.array(output_seq).astype(np.float32)

In [47]:
from tensorflow.keras.layers import Input, Dropout, Dense, LSTM, Bidirectional, Embedding, add 
from tensorflow.keras.utils import plot_model

# feat_ext_in = Input(shape=(2048,))
# # feat_ext_in = Input(shape=(4096,))
# feat_ext_dropout = Dropout(0.5)(feat_ext_in)
# feat_ext_dense = Dense(512, activation='relu')(feat_ext_dropout)
# encode_seq_in = Input(shape=(MAX_CAP_LENGTH,))
# encode_seq_emb = Embedding(MAX_VOCAB, 256, mask_zero=True)(encode_seq_in)
# encode_seq_dropout = Dropout(0.5)(encode_seq_emb)
# # encode_seq_bi_lstm = Bidirectional(LSTM(256))(encode_seq_dropout)
# encode_seq_bi_lstm = LSTM(512)(encode_seq_dropout)
# cap_gen_decoder = add([feat_ext_dense, encode_seq_bi_lstm])
# cap_gen_decoder_dense = Dense(256, activation='relu')(cap_gen_decoder)
# cap_gen_out = Dense(MAX_VOCAB, activation='softmax')(cap_gen_decoder_dense)
# merged_cap_gen = Model(inputs=[feat_ext_in, encode_seq_in], outputs=cap_gen_out)
# merged_cap_gen.compile(loss='categorical_crossentropy', optimizer='adam')
# merged_cap_gen = tf.keras.models.load_model("inception_v3_bi_lstm")
# merged_cap_gen = tf.keras.models.load_model("vgg_16_bi_lstm")
merged_cap_gen = tf.keras.models.load_model("inception_v3_lstm")
# merged_cap_gen = tf.keras.models.load_model("vgg_16_lstm")

In [18]:
# merged_cap_gen_history = merged_cap_gen.fit([input_feat, input_seq], output_seq, batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# merged_cap_gen.save("inception_v3_bi_lstm")
# merged_cap_gen.save("vgg_16_bi_lstm")
# merged_cap_gen.save("inception_v3_lstm")
# merged_cap_gen.save("vgg_16_lstm")



INFO:tensorflow:Assets written to: inception_v3_lstm\assets


INFO:tensorflow:Assets written to: inception_v3_lstm\assets


In [48]:
test_feat = inc_v3.predict(test_images_inc)
# test_feat = vgg_16.predict(test_images_vgg)

# test_in_feat, test_in_seq, test_out_seq = [], [], []

# def create_test_seq(unpadded_seq, features):
#     for i in range(1, len(unpadded_seq)):
#         in_, out = tf.keras.preprocessing.sequence.pad_sequences([unpadded_seq[:i]], maxlen=MAX_CAP_LENGTH, padding="post")[0], unpadded_seq[i]
#         out = to_categorical([out], num_classes=MAX_VOCAB)[0]
        
#         test_in_feat.append(features)
#         test_in_seq.append(in_)
#         test_out_seq.append(out)

# for i in range(len(test_feat)):
#     create_test_seq(test_seq[i], test_feat[i])
    
# test_in_feat = np.array(test_in_feat).astype(np.float32)
# test_in_seq = np.array(test_in_seq).astype(np.float32)
# test_out_seq = np.array(test_out_seq).astype(np.float32)



In [49]:
predicted_captions = []

for i in range(1, len(test_feat)+1):
    pred_cap = [1]
    
    for j in range(1, MAX_CAP_LENGTH):
        next_word = merged_cap_gen.predict([test_feat[i-1:i], np.array([pred_cap]).astype(np.float32)])
        next_word = np.argmax(next_word)
        pred_cap.append(next_word)
        
        if pred_cap[-1] == 29:
            break
    
    new_cap = ""
    
    for seq in pred_cap[1:-1]:
        new_cap += " " + index_to_word[seq]
        
    predicted_captions.append(new_cap[1:])



In [50]:
for i, j in zip(test_captions, predicted_captions):
#     with open("comparison_iv3_bi_lstm.txt", "a") as f:
#         f.write("Actual: " + " ".join(i.split(" ")[1:-1]) + "\nPredicted: " + j + "\n--\n")
        
    # with open("comparison_v16_bi_lstm.txt", "a") as f:
    #     f.write("Actual: " + " ".join(i.split(" ")[1:-1]) + "\nPredicted: " + j + "\n--\n")
    
    with open("comparison_iv3_lstm.txt", "a") as f:
        f.write("Actual: " + " ".join(i.split(" ")[1:-1]) + "\nPredicted: " + j + "\n--\n")
    
    # with open("comparison_v16_bi_lstm.txt", "a") as f:
    #     f.write("Actual: " + " ".join(i.split(" ")[1:-1]) + "\nPredicted: " + j + "\n--\n")

In [51]:
total = 0
correct = 0
incorrect = 0

for i, j in zip(test_captions, predicted_captions):
    total += len(i.split(" ")[1:-1])
    for word in i.split(" ")[1:-1]:
        if word in j:
            correct += 1
            
for i, j in zip(test_captions, predicted_captions):
    for word in j.split(" "):
        if word not in i:
            incorrect += 1

In [52]:
print(f"Total {total} Words among those {correct} words from actual captions are also present in the predicted captions having a percentage of {correct/total*100.0}% and {incorrect} words from predicted captions are not present in the actual captions having a percentage of {incorrect/total*100.0}%")

Total 2167 Words among those 1633 words from actual captions are also present in the predicted captions having a percentage of 75.3576372865713% and 705 words from predicted captions are not present in the actual captions having a percentage of 32.533456391324414%
