In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional, RepeatVector
from tensorflow.keras.applications import VGG19, InceptionV3, ResNet50, DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
from tensorflow.keras.utils import plot_model
from keras.layers import Lambda
from keras.utils import plot_model
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split

In [None]:
# Load the image file listing and the pipe-separated caption table
train_images_list = os.listdir('flickr30k_images/flickr30k_images/flickr30k_images/')

data = pd.read_csv("flickr30k_images/results.csv", sep="|")
# Two column names arrive with a leading space (presumably from the "| "
# separator in the header row); normalise both in a single rename.
data.rename(columns={' comment': 'comment', ' comment_number': 'comment_number'}, inplace=True)

In [None]:
def readImage(path,img_size=224):
    """Load one image as an RGB array resized to (img_size, img_size) and divided by 255.

    Args:
        path: location of the image file, passed straight to `load_img`.
        img_size: edge length, in pixels, of the square the image is resized to.

    Returns:
        The array produced by `img_to_array`, scaled by 1/255.
    """
    target_shape = (img_size, img_size)
    pixels = img_to_array(load_img(path, color_mode='rgb', target_size=target_shape))
    return pixels / 255.

def display_images(temp_df):
    """Show up to 15 images from `temp_df` on a 5x5 grid, each titled with its caption.

    Args:
        temp_df: DataFrame with an 'image_name' column (file name inside the
            Flickr30k image folder) and a 'comment' column (caption text).
            Only the first 15 rows are drawn; shorter frames are drawn in full.
    """
    temp_df = temp_df.reset_index(drop=True)
    # Never index past the end of the frame when fewer than 15 rows are passed.
    n_shown = min(len(temp_df), 15)
    plt.figure(figsize = (20 , 20))
    for i in range(n_shown):
        plt.subplot(5 , 5, i + 1)
        image = readImage(f"flickr30k_images/flickr30k_images/flickr30k_images/{temp_df['image_name'][i].strip()}")
        plt.imshow(image)
        # str() guards against a missing caption (NaN is a float and has no .strip()).
        plt.title("\n".join(wrap(str(temp_df['comment'][i]).strip(), 20)))
        plt.axis("off")
    # Spacing applies to the whole figure, so set it once instead of per subplot.
    plt.subplots_adjust(hspace = 0.7, wspace = 0.3)

In [None]:
# Preview 15 randomly sampled rows (image plus caption) from the caption table
display_images(data.sample(15))

In [None]:
def text_preprocessing(data):
    """Normalise the 'comment' column in place and wrap each caption in sequence markers.

    Steps: cast to str, lowercase, drop every character that is not a-z or
    whitespace, collapse whitespace runs, drop one-character words, then add
    the "startseq " prefix and " endseq" suffix.

    Args:
        data: DataFrame with a 'comment' column. The column is overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    comments = data['comment'].astype(str).str.lower()
    # Python's str.replace treats its pattern literally, so the cleanup has to go
    # through a regex-aware call. Whitespace is kept here so words stay separated.
    comments = comments.str.replace(r"[^a-z\s]", "", regex=True)
    comments = comments.str.replace(r"\s+", " ", regex=True)
    comments = comments.apply(lambda text: " ".join(word for word in text.split() if len(word) > 1))
    data['comment'] = "startseq " + comments + " endseq"
    return data

In [None]:
# Clean the captions (text_preprocessing overwrites data['comment'] and returns data)
data = text_preprocessing(data)
# Flat list of every cleaned caption, in row order
captions = data['comment'].tolist()
# Show the first ten captions as the cell output
captions[:10]

In [None]:
# Build the vocabulary from every caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
# One extra slot on top of the word index size
vocab_size = 1 + len(tokenizer.word_index)
# Longest caption, measured in whitespace-separated words
max_length = max(map(lambda caption: len(caption.split()), captions))

# Split by image (not by row) so all captions of an image land on the same side
images = data['image_name'].unique().tolist()
nimages = len(images)

split_index = round(0.85*nimages)
train_images, val_images = images[:split_index], images[split_index:]

train = data[data['image_name'].isin(train_images)].reset_index(drop=True)
test = data[data['image_name'].isin(val_images)].reset_index(drop=True)

# Cell output: token ids of the second caption
tokenizer.texts_to_sequences([captions[1]])[0]

In [None]:
# Folder holding the image files; joined with each image name during feature extraction
image_path = 'flickr30k_images/flickr30k_images/flickr30k_images/'

<h2>VGG16</h2>

In [None]:
from tensorflow.keras.applications.vgg16 import VGG16
# Aliased so it cannot be confused with another architecture's preprocess_input.
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess_input

# VGG16 feature extractor: the full network cut at its second-to-last layer
vgg_model = VGG16()
vgg_fe = Model(inputs=vgg_model.input, outputs=vgg_model.layers[-2].output)

img_size = 224
vgg_features = {}

# One forward pass per distinct image; results are keyed by image file name.
for image in tqdm(data['image_name'].unique().tolist()):
    img = load_img(os.path.join(image_path, image), target_size=(img_size, img_size))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    # Use the model's own input preprocessing rather than a plain 1/255 scaling,
    # so the pretrained weights see the input distribution they expect.
    img = vgg_preprocess_input(img)

    vgg_feature = vgg_fe.predict(img, verbose=0)
    vgg_features[image] = vgg_feature



class CustomDataGenerator(Sequence):
    """Keras Sequence that turns caption rows into next-word training samples.

    Each row of `df` pairs an image name (`X_col`) with one caption (`y_col`).
    For every caption, one sample is emitted per token after the first: the
    image's precomputed feature vector plus the token prefix (padded to
    `max_length`) as inputs, and the next token one-hot encoded over
    `vocab_size` as the target.

    `vgg_features` maps image name -> array whose first row is the feature
    vector. `directory` is stored but not read by this class.

    NOTE: a class with the same name is defined again in later cells of this
    notebook; whichever definition ran last is the one in effect.
    """

    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
                 vocab_size, max_length, vgg_features, shuffle=True):
        super().__init__()
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.vgg_features = vgg_features
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when `shuffle` is set."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        return self.n // self.batch_size

    def __getitem__(self, index):
        """Return ((image_features, padded_prefixes), one_hot_targets) for batch `index`."""
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size, :]
        X1, X2, y = self.__get_data(batch)
        return (X1, X2), y

    def __get_data(self, batch):
        """Expand the rows of `batch` into per-token training samples."""
        X1, X2, y = list(), list(), list()
        # Walk the batch row by row: every row already carries exactly one caption.
        # Re-querying the batch for all captions of the row's image would emit each
        # caption once per row that shares that image.
        for image, caption in zip(batch[self.X_col].tolist(), batch[self.y_col].tolist()):
            feature = self.vgg_features[image][0]
            seq = self.tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
        return X1, X2, y

# Two inputs: the 4096-wide image feature vector and the padded token prefix
input1 = Input(shape=(4096,))
input2 = Input(shape=(max_length,))

# Project the image vector to 256 dims, then give it a length-1 sequence axis
img_features = Dense(units=256, activation='relu')(input1)
img_features_reshaped = Reshape(target_shape=(1, 256), input_shape=(256,))(img_features)

sentence_features = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=False)(input2)

# Image projection is joined to the word embeddings along the sequence axis
merged = concatenate([img_features_reshaped, sentence_features], axis=1)
sentence_features = LSTM(units=256)(merged)

x = Dropout(rate=0.5)(sentence_features)
# Add the image projection back onto the LSTM summary before the dense head
x = add([x, img_features])
x = Dense(units=128, activation='relu')(x)
x = Dropout(rate=0.5)(x)
output = Dense(units=vocab_size, activation='softmax')(x)

vgg_caption_model = Model(inputs=[input1, input2], outputs=output)
vgg_caption_model.compile(optimizer='adam', loss='categorical_crossentropy')

plot_model(vgg_caption_model)
vgg_caption_model.summary()



In [None]:
class CustomDataGenerator(Sequence):
    """Keras Sequence that turns caption rows into next-word training samples.

    Each row of `df` pairs an image name (`X_col`) with one caption (`y_col`).
    For every caption, one sample is emitted per token after the first: the
    image's precomputed feature vector plus the token prefix (padded to
    `max_length`) as inputs, and the next token one-hot encoded over
    `vocab_size` as the target.

    `vgg_features` maps image name -> array whose first row is the feature
    vector. `directory` is stored but not read by this class.
    """

    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
                 vocab_size, max_length, vgg_features, shuffle=True):
        super().__init__()
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.vgg_features = vgg_features
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when `shuffle` is set."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        return self.n // self.batch_size

    def __getitem__(self, index):
        """Return ((image_features, padded_prefixes), one_hot_targets) for batch `index`."""
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size, :]
        X1, X2, y = self.__get_data(batch)
        return (X1, X2), y

    def __get_data(self, batch):
        """Expand the rows of `batch` into per-token training samples."""
        X1, X2, y = list(), list(), list()
        # Walk the batch row by row: every row already carries exactly one caption.
        # Re-querying the batch for all captions of the row's image would emit each
        # caption once per row that shares that image.
        for image, caption in zip(batch[self.X_col].tolist(), batch[self.y_col].tolist()):
            feature = self.vgg_features[image][0]
            seq = self.tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
        return X1, X2, y

In [None]:
# Training batches: rows of `train`, 64 caption rows per batch, backed by the VGG features
train_generator = CustomDataGenerator(df=train, X_col='image_name', y_col='comment', batch_size=64, directory=image_path,
                                      tokenizer=tokenizer, vocab_size=vocab_size, max_length=max_length, vgg_features=vgg_features)

# Validation batches: same settings over the held-out `test` rows
validation_generator = CustomDataGenerator(df=test, X_col='image_name', y_col='comment', batch_size=64, directory=image_path,
                                           tokenizer=tokenizer, vocab_size=vocab_size, max_length=max_length, vgg_features=vgg_features)

In [None]:
# Keep only the weights with the lowest validation loss
model_name = "vgg_model.h5"
checkpoint = ModelCheckpoint(
    filepath=model_name,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop after 5 epochs without val_loss improvement and roll back to the best weights
earlystopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 3 stagnant epochs, never going below 1e-8
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    verbose=1,
    factor=0.2,
    min_lr=1e-8,
)

In [None]:
# Train the VGG-based caption model for 5 epochs, validating on the held-out generator;
# the returned History object is kept in `history` for the loss plot below.
history = vgg_caption_model.fit(
    train_generator,
    epochs=5,
    validation_data=validation_generator,
    callbacks=[checkpoint, earlystopping, learning_rate_reduction])

In [None]:
# Training vs. validation loss per epoch for the run stored in `history`
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

<h2>InceptionV3</h2>

In [None]:
# Aliased so it cannot be confused with another architecture's preprocess_input.
from tensorflow.keras.applications.inception_v3 import preprocess_input as inception_preprocess_input

# InceptionV3 feature extractor: the full network cut at its second-to-last layer
inception_model = InceptionV3()
inception_fe = Model(inputs=inception_model.input, outputs=inception_model.layers[-2].output)

img_size = 299  # InceptionV3 expects input images of size 299x299
inception_features = {}

# One forward pass per distinct image; results are keyed by image file name.
for image in tqdm(data['image_name'].unique().tolist()):
    img = load_img(os.path.join(image_path, image), target_size=(img_size, img_size))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    # Use the model's own input preprocessing rather than a plain 1/255 scaling,
    # so the pretrained weights see the input distribution they expect.
    img = inception_preprocess_input(img)

    inception_feature = inception_fe.predict(img, verbose=0)
    inception_features[image] = inception_feature

In [None]:
class CustomDataGenerator(Sequence):
    """Keras Sequence that turns caption rows into next-word training samples.

    Each row of `df` pairs an image name (`X_col`) with one caption (`y_col`).
    For every caption, one sample is emitted per token after the first: the
    image's precomputed feature vector plus the token prefix (padded to
    `max_length`) as inputs, and the next token one-hot encoded over
    `vocab_size` as the target.

    `inception_features` maps image name -> array whose first row is the
    feature vector. `directory` is stored but not read by this class.
    """

    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
                 vocab_size, max_length, inception_features, shuffle=True):
        super().__init__()
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.inception_features = inception_features
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when `shuffle` is set."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        return self.n // self.batch_size

    def __getitem__(self, index):
        """Return ((image_features, padded_prefixes), one_hot_targets) for batch `index`."""
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size, :]
        X1, X2, y = self.__get_data(batch)
        return (X1, X2), y

    def __get_data(self, batch):
        """Expand the rows of `batch` into per-token training samples."""
        X1, X2, y = list(), list(), list()
        # Walk the batch row by row: every row already carries exactly one caption.
        # Re-querying the batch for all captions of the row's image would emit each
        # caption once per row that shares that image.
        for image, caption in zip(batch[self.X_col].tolist(), batch[self.y_col].tolist()):
            feature = self.inception_features[image][0]
            seq = self.tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
        return X1, X2, y

In [None]:
# Two inputs: the image feature vector and the padded token prefix
input1 = Input(shape=(2048,))  # InceptionV3's second-to-last layer has 2048 units
input2 = Input(shape=(max_length,))

# Project the image vector to 256 dims, then give it a length-1 sequence axis
img_features = Dense(units=256, activation='relu')(input1)
img_features_reshaped = Reshape(target_shape=(1, 256), input_shape=(256,))(img_features)

sentence_features = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=False)(input2)

# Image projection is joined to the word embeddings along the sequence axis
merged = concatenate([img_features_reshaped, sentence_features], axis=1)
sentence_features = LSTM(units=256)(merged)

x = Dropout(rate=0.5)(sentence_features)
# Add the image projection back onto the LSTM summary before the dense head
x = add([x, img_features])
x = Dense(units=128, activation='relu')(x)
x = Dropout(rate=0.5)(x)
output = Dense(units=vocab_size, activation='softmax')(x)

inception_caption_model = Model(inputs=[input1, input2], outputs=output)
inception_caption_model.compile(optimizer='adam', loss='categorical_crossentropy')

plot_model(inception_caption_model)
inception_caption_model.summary()

In [None]:
# Batch generators backed by the InceptionV3 features (64 caption rows per batch)
train_generator = CustomDataGenerator(
    df=train, X_col='image_name', y_col='comment', batch_size=64, directory=image_path,
    tokenizer=tokenizer, vocab_size=vocab_size, max_length=max_length,
    inception_features=inception_features,
)

validation_generator = CustomDataGenerator(
    df=test, X_col='image_name', y_col='comment', batch_size=64, directory=image_path,
    tokenizer=tokenizer, vocab_size=vocab_size, max_length=max_length,
    inception_features=inception_features,
)

# Keep only the weights with the lowest validation loss
model_name = "inception_model.h5"
checkpoint = ModelCheckpoint(
    filepath=model_name,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop after 5 epochs without val_loss improvement and roll back to the best weights
earlystopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 3 stagnant epochs, never going below 1e-8
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    verbose=1,
    factor=0.2,
    min_lr=1e-8,
)

history = inception_caption_model.fit(
    train_generator,
    epochs=5,
    validation_data=validation_generator,
    callbacks=[checkpoint, earlystopping, learning_rate_reduction],
)

In [None]:
# Training vs. validation loss per epoch for the run stored in `history`
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

<h2>ResNet</h2>

In [None]:
# NOTE: this binds the module-level name `preprocess_input` to the ResNet50 variant.
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

# Load ResNet50 with ImageNet weights, then rebind `resnet_model` to a feature
# extractor that outputs the network's second-to-last layer.
resnet_model = ResNet50(weights='imagenet')
resnet_model = Model(inputs=resnet_model.input, outputs=resnet_model.layers[-2].output)

In [None]:
img_size = 224
resnet_features = {}

# One forward pass per distinct image; results are keyed by image file name.
for image in tqdm(data['image_name'].unique().tolist()):
    img = load_img(os.path.join(image_path, image), target_size=(img_size, img_size))
    # Array conversion, ResNet50 preprocessing, then a leading batch axis
    img = np.expand_dims(preprocess_input(img_to_array(img)), axis=0)

    resnet_feature = resnet_model.predict(img, verbose=0)
    resnet_features[image] = resnet_feature

In [None]:
class CustomDataGenerator(Sequence):
    """Keras Sequence that turns caption rows into next-word training samples.

    Each row of `df` pairs an image name (`X_col`) with one caption (`y_col`).
    For every caption, one sample is emitted per token after the first: the
    image's precomputed feature vector plus the token prefix (padded to
    `max_length`) as inputs, and the next token one-hot encoded over
    `vocab_size` as the target.

    `resnet_features` maps image name -> array whose first row is the feature
    vector. `directory` is stored but not read by this class.
    """

    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
                 vocab_size, max_length, resnet_features, shuffle=True):
        super().__init__()
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.resnet_features = resnet_features
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when `shuffle` is set."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        return self.n // self.batch_size

    def __getitem__(self, index):
        """Return ((image_features, padded_prefixes), one_hot_targets) for batch `index`."""
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size, :]
        X1, X2, y = self.__get_data(batch)
        return (X1, X2), y

    def __get_data(self, batch):
        """Expand the rows of `batch` into per-token training samples."""
        X1, X2, y = list(), list(), list()
        # Walk the batch row by row: every row already carries exactly one caption.
        # Re-querying the batch for all captions of the row's image would emit each
        # caption once per row that shares that image.
        for image, caption in zip(batch[self.X_col].tolist(), batch[self.y_col].tolist()):
            feature = self.resnet_features[image][0]
            seq = self.tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
        X1 = np.array(X1)
        X2 = np.array(X2)
        y = np.array(y)
        return X1, X2, y

In [None]:
# Two inputs: the 2048-wide image feature vector and the padded token prefix
input1 = Input(shape=(2048,))
input2 = Input(shape=(max_length,))

# Project the image vector to 256 dims, then give it a length-1 sequence axis
img_features = Dense(units=256, activation='relu')(input1)
img_features_reshaped = Reshape(target_shape=(1, 256), input_shape=(256,))(img_features)

sentence_features = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=False)(input2)

# Image projection is joined to the word embeddings along the sequence axis
merged = concatenate([img_features_reshaped, sentence_features], axis=1)
sentence_features = LSTM(units=256)(merged)

x = Dropout(rate=0.5)(sentence_features)
# Add the image projection back onto the LSTM summary before the dense head
x = add([x, img_features])
x = Dense(units=128, activation='relu')(x)
x = Dropout(rate=0.5)(x)
output = Dense(units=vocab_size, activation='softmax')(x)

resnet_caption_model = Model(inputs=[input1, input2], outputs=output)
resnet_caption_model.compile(optimizer='adam', loss='categorical_crossentropy')

plot_model(resnet_caption_model)
resnet_caption_model.summary()

In [None]:
# Batch generators backed by the ResNet50 features (64 caption rows per batch)
resnet_train_generator = CustomDataGenerator(
    df=train, X_col='image_name', y_col='comment', batch_size=64, directory=image_path,
    tokenizer=tokenizer, vocab_size=vocab_size, max_length=max_length,
    resnet_features=resnet_features,
)
resnet_validation_generator = CustomDataGenerator(
    df=test, X_col='image_name', y_col='comment', batch_size=64, directory=image_path,
    tokenizer=tokenizer, vocab_size=vocab_size, max_length=max_length,
    resnet_features=resnet_features,
)

# Keep only the weights with the lowest validation loss
resnet_model_name = "resnet_model.h5"
resnet_checkpoint = ModelCheckpoint(
    filepath=resnet_model_name,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop after 5 epochs without val_loss improvement and roll back to the best weights
resnet_earlystopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 3 stagnant epochs, never going below 1e-8
resnet_learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    verbose=1,
    factor=0.2,
    min_lr=1e-8,
)

resnet_history = resnet_caption_model.fit(
    resnet_train_generator,
    epochs=5,
    validation_data=resnet_validation_generator,
    callbacks=[resnet_checkpoint, resnet_earlystopping, resnet_learning_rate_reduction],
)

In [None]:
# Training vs. validation loss per epoch for the ResNet run.
# The ResNet fit stores its result in `resnet_history`; `history` still holds
# the previous model's curves.
plt.figure(figsize=(20,8))
plt.plot(resnet_history.history['loss'])
plt.plot(resnet_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

<h2>Regularization</h2>

In [None]:
# L1 and L2 penalty coefficients passed to l1_l2(...) in the cells below
l1_reg = 0.01
l2_reg = 0.01

In [None]:
# Imported here because this cell runs before the notebook's later import of these names.
from tensorflow.keras.models import load_model
from tensorflow.keras.regularizers import l1_l2

# Reload the best Inception caption model saved by the checkpoint callback
inception_caption_model = load_model('inception_model.h5')

for layer in inception_caption_model.layers:
    # Recurrent layers keep their regularizers on the wrapped cell and expose them
    # on the layer as read-only properties, so set the cell's attribute instead.
    target = getattr(layer, 'cell', layer)
    if hasattr(target, 'kernel_regularizer'):
        target.kernel_regularizer = l1_l2(l1=l1_reg, l2=l2_reg)

# Setting the attribute on an already-built layer does not add the penalty to the
# model's losses. Rebuilding from the updated config creates the weights with the
# regularizers attached; the trained weights are then copied back.
# NOTE(review): confirm this round-trip on the installed Keras version.
trained_weights = inception_caption_model.get_weights()
inception_caption_model = type(inception_caption_model).from_config(inception_caption_model.get_config())
inception_caption_model.set_weights(trained_weights)

In [None]:
# Recompile the reloaded model, this time also tracking accuracy
inception_caption_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.models import load_model

# Define regularization parameters
l1_reg = 0.01
l2_reg = 0.01

# Load the Inception model
inception_caption_model = load_model('inception_model.h5')

# Apply regularization to the model layers
for layer in inception_caption_model.layers:
    # Recurrent layers keep their regularizers on the wrapped cell and expose them
    # on the layer as read-only properties, so set the cell's attribute instead.
    target = getattr(layer, 'cell', layer)
    if hasattr(target, 'kernel_regularizer'):
        target.kernel_regularizer = l1_l2(l1=l1_reg, l2=l2_reg)

# Setting the attribute on an already-built layer does not add the penalty to the
# model's losses. Rebuilding from the updated config creates the weights with the
# regularizers attached; the trained weights are then copied back.
# NOTE(review): confirm this round-trip on the installed Keras version.
trained_weights = inception_caption_model.get_weights()
inception_caption_model = type(inception_caption_model).from_config(inception_caption_model.get_config())
inception_caption_model.set_weights(trained_weights)

# Compile the model
inception_caption_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Generate captions for 10 random held-out rows with the regularized Inception model
reg_samples = test.sample(10)

# NOTE(review): predict_caption, calculate_sequence_metrics and image_directory are
# not defined in the visible part of this notebook (the image folder is called
# image_path in earlier cells) — confirm they exist before running this cell.
# NOTE(review): the column is named 'vgg_caption' although the Inception model
# produces it; the name is kept as-is here.
reg_samples['vgg_caption'] = reg_samples['image_name'].apply(lambda x: predict_caption(inception_caption_model, os.path.join(image_directory, x), tokenizer, max_length, inception_fe))

# Get the original captions from the 'data' dataframe
reg_original_captions = data.loc[data['image_name'].isin(reg_samples['image_name']), 'comment'].tolist()

# Calculate evaluation metrics for the generated captions
reg_metrics = calculate_sequence_metrics(inception_caption_model, reg_samples['vgg_caption'].tolist())

# reg_metrics is indexed as (accuracy, precision, recall, f1)
print("Regularization Metrics:")
print(f"Accuracy: {reg_metrics[0]:.4f}")
print(f"Precision: {reg_metrics[1]:.4f}")
print(f"Recall: {reg_metrics[2]:.4f}")
print(f"F1 Score: {reg_metrics[3]:.4f}")