# In [None]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.densenet import preprocess_input as densenet_preprocess_input
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess_input
from tensorflow.keras.applications.inception_v3 import preprocess_input as inception_preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from statistics import median

def generate_caption(model, feature_model, image_path, tokenizer, max_length, preprocess_input,
                     target_size=(224, 224)):
    """Greedily decode a caption for a single image.

    Args:
        model: Caption model, called as ``model.predict([feature, sequence])``.
        feature_model: Image feature extractor applied to the preprocessed image.
        image_path: Path of the image file to caption.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        max_length: Maximum number of decoding steps; also the length the
            partial caption is padded to.
        preprocess_input: Preprocessing function matching ``feature_model``.
        target_size: ``(height, width)`` the image is resized to before
            preprocessing. Defaults to ``(224, 224)``; pass ``(299, 299)``
            for an InceptionV3 extractor.

    Returns:
        The generated words joined by single spaces (an empty string when no
        word was produced).
    """
    img = load_img(image_path, target_size=target_size)
    img = img_to_array(img)
    img = preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    feature = feature_model.predict(img, verbose=0)

    words = []
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([' '.join(words)])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)

        y_pred = model.predict([feature, sequence], verbose=0)
        word_index = int(np.argmax(y_pred, axis=-1)[0])

        # .get() instead of [] so an index without a vocabulary entry
        # (e.g. the padding index 0) ends decoding instead of raising KeyError.
        word = tokenizer.index_word.get(word_index)
        if word is None:
            break
        words.append(word)
        if word == '':
            break

    return ' '.join(words).strip()

def generate_captions_for_model(model, feature_model, model_name, image_paths, tokenizer, max_length, preprocess_input):
    """Caption every image in ``image_paths`` and save the results as CSV.

    One row is produced per path with the columns ``image_name`` (file name),
    ``image_number`` (file stem converted with ``int``) and ``comment`` (the
    generated caption). The table is written to
    ``finalCaptions_<model_name>.csv`` in the working directory and returned.
    """
    def _build_row(path):
        # Caption first, then parse the name, matching the original call order.
        text = generate_caption(model, feature_model, path, tokenizer, max_length, preprocess_input)
        file_name = os.path.basename(path)
        stem, _extension = os.path.splitext(file_name)
        return {'image_name': file_name, 'image_number': int(stem), 'comment': text}

    table = pd.DataFrame([_build_row(path) for path in image_paths])
    table.to_csv(f"finalCaptions_{model_name}.csv", index=False)
    return table

# Load one caption model and one image feature extractor per backbone from
# .h5 files in the working directory.
# NOTE(review): the VGG extractor file is named "vgg19" while this file imports
# preprocess_input from vgg16 -- confirm which network was actually used.
vgg_caption_model = load_model('vgg_model.h5')
vgg_fe = load_model('vgg19_feature_extractor.h5')

inception_caption_model = load_model('inception_model.h5')
inception_fe = load_model('inception_feature_extractor.h5')

# NOTE: unlike vgg_fe / inception_fe, the ResNet feature extractor is bound to
# the name `resnet_model`; later code passes it as the feature extractor.
resnet_caption_model = load_model('resnet_model.h5')
resnet_model = load_model('resnet_feature_extractor.h5')

# Pipe-separated captions file; the two renames strip the leading space from
# the ' comment' and ' comment_number' column headers.
data = pd.read_csv("flickr30k_images/results.csv", sep="|")
data.rename(columns={' comment': 'comment'}, inplace=True)
data.rename(columns={' comment_number': 'comment_number'}, inplace=True)

def text_preprocessing(data):
    """Normalise the ``comment`` column in place and return the same frame.

    Each value is converted to ``str``, lower-cased, and stripped of
    whitespace-separated tokens that are one character long; the remaining
    tokens are re-joined with single spaces.
    """
    def _clean(text):
        lowered = text.lower()
        # NOTE: str.replace matches these patterns literally (they are not
        # treated as regular expressions), so only the exact character
        # sequences below are ever substituted.
        lowered = lowered.replace("[^A-Za-z]", "")
        lowered = lowered.replace("\\s+", " ")
        kept = [token for token in lowered.split() if len(token) > 1]
        return " ".join(kept)

    data['comment'] = data['comment'].astype(str).apply(_clean)
    return data

# Clean the captions in place and keep a plain-list copy of them.
data = text_preprocessing(data)
captions = data['comment'].tolist()
# Bare expression: only produces output in an interactive / notebook session.
captions[:10]

from tensorflow.keras.preprocessing.text import Tokenizer
# Create the tokenizer and fit it on every caption in `data`
# (fitting happens before the train/test split further down).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['comment'])

# Longest caption, counted in whitespace-separated words.
# NOTE: this value is replaced by a hard-coded `max_length = 74` later in the file.
max_length = max(len(caption.split()) for caption in data['comment'])

image_path = 'flickr30k_images/flickr30k_images/flickr30k_images/'

# One path per entry returned by os.listdir for the image directory, i.e. the
# whole directory rather than only a held-out split.
test_image_paths = [os.path.join(image_path, image) for image in os.listdir(image_path)]

# Partition by image: the first 85% of distinct image names (in order of first
# appearance) form the training set, the remainder the test set, so all
# captions of one image end up on the same side.
images = data['image_name'].unique().tolist()
nimages = len(images)
split_index = round(0.85 * nimages)
train_images, val_images = images[:split_index], images[split_index:]
train = data.loc[data['image_name'].isin(train_images)].reset_index(drop=True)
test = data.loc[data['image_name'].isin(val_images)].reset_index(drop=True)

import random
# Pick 100 random paths from `test_image_paths`.
# NOTE(review): `test_image_paths` is built from os.listdir over the whole image
# directory, so despite its name this draw is not limited to the test split.
# random.sample raises ValueError when fewer than 100 paths are available.
selected_image_paths = random.sample(test_image_paths, 100)

# Hard-coded override of the max_length computed from the data above.
# Presumably the sequence length the saved models were trained with -- TODO confirm.
max_length = 74

# 15 random caption rows from the test split, re-indexed 0..14.
samples = test.sample(15)
samples.reset_index(drop=True,inplace=True)

# Attach a loss and optimizer to each loaded caption model.
vgg_caption_model.compile(loss='categorical_crossentropy', optimizer='adam')
inception_caption_model.compile(loss='categorical_crossentropy', optimizer='adam')
resnet_caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

from tensorflow.keras.applications.vgg16 import preprocess_input

def idx_to_word(integer, tokenizer):
    """Return the vocabulary word stored under index ``integer``, or ``None``.

    Args:
        integer: Word index to look up (a Python or NumPy integer).
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Prefer the tokenizer's reverse map: one dict lookup instead of scanning
    # the whole vocabulary for every decoded token.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word is not None:
        return index_word.get(integer)

    # Fallback for tokenizer-like objects that only provide `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_caption(model, image_path, tokenizer, max_length, vgg_fe):
    """Greedily decode a caption for one image using VGG preprocessing.

    Args:
        model: Caption model, called as ``model.predict([feature, sequence])``.
        image_path: Path of the image file to caption.
        tokenizer: Fitted tokenizer used to encode the partial caption and to
            map predicted indices back to words.
        max_length: Maximum number of decoding steps and padded sequence length.
        vgg_fe: Feature extractor applied to the preprocessed 224x224 image.

    Returns:
        The generated words, each preceded by a single space (so a non-empty
        result starts with a space); ``""`` if no word was produced.
    """
    img = load_img(image_path, target_size=(224, 224))
    img = img_to_array(img)
    # Use the VGG-specific alias imported at the top of the file: the bare name
    # `preprocess_input` is re-imported from inception_v3 and resnet50 later in
    # this file, so relying on it would silently switch preprocessing.
    img = vgg_preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    feature = vgg_fe.predict(img, verbose=0)

    in_text = ""
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length)

        # verbose=0: avoid one progress line per decoding step.
        y_pred = model.predict([feature, sequence], verbose=0)
        word = idx_to_word(np.argmax(y_pred), tokenizer)

        # Stop as soon as the predicted index has no vocabulary entry.
        if word is None:
            break

        in_text += " " + word

    return in_text

image_directory = 'flickr30k_images/flickr30k_images/flickr30k_images/'


# Caption every sampled row with the VGG model and store the text in a
# 'caption' column of `samples`.
# NOTE: the loop rebinds the module-level name `image_path` (previously the
# image directory) to the path of the last processed image.
for index, record in samples.iterrows():
    image_filename = record['image_name']
    image_path = os.path.join(image_directory, image_filename)
    caption = predict_caption(vgg_caption_model, image_path, tokenizer, max_length, vgg_fe)
    samples.loc[index, 'caption'] = caption
    
def readImage(path,img_size=224):
    """Load ``path`` as an RGB image resized to ``img_size`` x ``img_size``.

    Returns the pixel array divided by 255.
    """
    side = (img_size, img_size)
    pixels = img_to_array(load_img(path, color_mode='rgb', target_size=side))
    return pixels / 255.

def display_images(temp_df, caption_column='comment'):
    """Plot up to 15 images from ``temp_df`` on a 5x5 grid with wrapped titles.

    Args:
        temp_df: DataFrame with an ``image_name`` column (file names inside the
            Flickr30k image directory) and the column named by
            ``caption_column``.
        caption_column: Column whose text becomes each image title. Defaults
            to ``'comment'``; pass e.g. ``'caption'`` to show generated
            captions instead.
    """
    temp_df = temp_df.reset_index(drop=True)
    # Never index past the end of the frame when it has fewer than 15 rows.
    shown = min(15, len(temp_df))
    plt.figure(figsize = (20 , 20))

    for i in range(shown):
        plt.subplot(5 , 5, i + 1)
        image = readImage(f"flickr30k_images/flickr30k_images/flickr30k_images/{temp_df['image_name'][i].strip()}")
        plt.imshow(image)
        plt.title("\n".join(wrap(temp_df[caption_column][i].strip(), 20)))
        plt.axis("off")
        print("---------------------------------------------------")

    # Spacing is a figure-level setting, so apply it once instead of per image.
    plt.subplots_adjust(hspace = 0.7, wspace = 0.3)
        
import matplotlib.pyplot as plt
from textwrap import wrap
# NOTE(review): display_images titles each picture with the 'comment' column,
# so the 'caption' values written by the loop above are not what is shown.
display_images(samples)

from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input

def idx_to_word(integer, tokenizer):
    """Return the vocabulary word stored under index ``integer``, or ``None``.

    Args:
        integer: Word index to look up (a Python or NumPy integer).
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Prefer the tokenizer's reverse map: one dict lookup instead of scanning
    # the whole vocabulary for every decoded token.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word is not None:
        return index_word.get(integer)

    # Fallback for tokenizer-like objects that only provide `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_caption(model, image_path, tokenizer, max_length, inception_fe):
    """Greedily decode a caption for one image using InceptionV3 preprocessing.

    Args:
        model: Caption model, called as ``model.predict([feature, sequence])``.
        image_path: Path of the image file to caption.
        tokenizer: Fitted tokenizer used to encode the partial caption and to
            map predicted indices back to words.
        max_length: Maximum number of decoding steps and padded sequence length.
        inception_fe: Feature extractor applied to the preprocessed 299x299 image.

    Returns:
        The generated words, each preceded by a single space (so a non-empty
        result starts with a space); ``""`` if no word was produced.
    """
    img = load_img(image_path, target_size=(299, 299))  # InceptionV3 expects (299, 299) input size
    img = img_to_array(img)
    # Use the Inception-specific alias imported at the top of the file: the
    # bare name `preprocess_input` is re-imported from resnet50 later in this
    # file, so relying on it would silently switch preprocessing.
    img = inception_preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    feature = inception_fe.predict(img, verbose=0)

    in_text = ""
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length)

        # verbose=0: avoid one progress line per decoding step.
        y_pred = model.predict([feature, sequence], verbose=0)
        word = idx_to_word(np.argmax(y_pred), tokenizer)

        # Stop as soon as the predicted index has no vocabulary entry.
        if word is None:
            break

        in_text += " " + word

    return in_text

image_directory = 'flickr30k_images/flickr30k_images/flickr30k_images/'


# Caption every sampled row with the Inception model; this overwrites any
# value already stored in the 'caption' column of `samples`.
# NOTE: the loop rebinds the module-level name `image_path` to the path of the
# last processed image.
for index, record in samples.iterrows():
    image_filename = record['image_name']
    image_path = os.path.join(image_directory, image_filename)
    caption = predict_caption(inception_caption_model, image_path, tokenizer, max_length, inception_fe)
    samples.loc[index, 'caption'] = caption

import matplotlib.pyplot as plt
from textwrap import wrap
# NOTE(review): display_images titles each picture with the 'comment' column,
# so the 'caption' values written by the loop above are not what is shown.
display_images(samples)

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

def idx_to_word(integer, tokenizer):
    """Return the vocabulary word stored under index ``integer``, or ``None``.

    Args:
        integer: Word index to look up (a Python or NumPy integer).
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Prefer the tokenizer's reverse map: one dict lookup instead of scanning
    # the whole vocabulary for every decoded token.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word is not None:
        return index_word.get(integer)

    # Fallback for tokenizer-like objects that only provide `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_caption(model, image_path, tokenizer, max_length, resnet_fe):
    """Greedily decode a caption for one image using ResNet50 preprocessing.

    NOTE(review): this is the last ``predict_caption`` defined in the file, so
    the evaluation code further down calls it for the VGG and Inception models
    as well; those calls therefore get a 224x224 resize and ResNet
    preprocessing -- confirm that this is intended.

    Args:
        model: Caption model, called as ``model.predict([feature, sequence])``.
        image_path: Path of the image file to caption.
        tokenizer: Fitted tokenizer used to encode the partial caption and to
            map predicted indices back to words.
        max_length: Maximum number of decoding steps and padded sequence length.
        resnet_fe: Feature extractor applied to the preprocessed 224x224 image.

    Returns:
        The generated words, each preceded by a single space (so a non-empty
        result starts with a space); ``""`` if no word was produced.
    """
    img = load_img(image_path, target_size=(224, 224))  # ResNet50 expects (224, 224) input size
    img = img_to_array(img)
    # Use the ResNet-specific alias imported at the top of the file rather than
    # the bare `preprocess_input`, which this file imports from several
    # different applications modules.
    img = resnet_preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    feature = resnet_fe.predict(img, verbose=0)

    in_text = ""
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length)

        # verbose=0: avoid one progress line per decoding step.
        y_pred = model.predict([feature, sequence], verbose=0)
        word = idx_to_word(np.argmax(y_pred), tokenizer)

        # Stop as soon as the predicted index has no vocabulary entry.
        if word is None:
            break

        in_text += " " + word

    return in_text

image_directory = 'flickr30k_images/flickr30k_images/flickr30k_images/'


# Caption every sampled row with the ResNet model; this overwrites any value
# already stored in the 'caption' column of `samples`.
# `resnet_model` is the ResNet feature extractor loaded near the top of the file.
# NOTE: the loop rebinds the module-level name `image_path` to the path of the
# last processed image.
for index, record in samples.iterrows():
    image_filename = record['image_name']
    image_path = os.path.join(image_directory, image_filename)
    caption = predict_caption(resnet_caption_model, image_path, tokenizer, max_length, resnet_model)
    samples.loc[index, 'caption'] = caption
    
import matplotlib.pyplot as plt
from textwrap import wrap
# NOTE(review): display_images titles each picture with the 'comment' column,
# so the 'caption' values written by the loop above are not what is shown.
display_images(samples)

from nltk.translate.bleu_score import corpus_bleu

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import necessary functions from nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import necessary functions from nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu

from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu_score(references, hypotheses):
    """Corpus-level BLEU of ``hypotheses`` against one reference string each.

    ``references`` and ``hypotheses`` are paired positionally with ``zip``, so
    surplus items of the longer sequence are ignored. Pairs in which either
    string is empty are skipped. Both strings are split on whitespace.
    """
    usable_pairs = [(ref, hyp) for ref, hyp in zip(references, hypotheses) if ref and hyp]
    # corpus_bleu takes a list of reference-token-lists per hypothesis.
    reference_sets = [[ref.split()] for ref, _ in usable_pairs]
    candidates = [hyp.split() for _, hyp in usable_pairs]
    return corpus_bleu(reference_sets, candidates)

# Score all three models on the SAME random caption rows so the BLEU values
# are comparable; each model gets its own copy to hold its generated captions.
bleu_eval_rows = test.sample(10)
vgg_samples = bleu_eval_rows.copy()
inception_samples = bleu_eval_rows.copy()
resnet_samples = bleu_eval_rows.copy()

# NOTE(review): `predict_caption` resolves to the most recent definition in this
# file (the ResNet50 variant, 224x224 input), so the VGG and Inception models
# are not fed through their own preprocessing here -- confirm and route each
# model through its matching pipeline.
vgg_samples['vgg_caption'] = vgg_samples['image_name'].apply(lambda x: predict_caption(vgg_caption_model, os.path.join(image_directory, x), tokenizer, max_length, vgg_fe))
inception_samples['inception_caption'] = inception_samples['image_name'].apply(lambda x: predict_caption(inception_caption_model, os.path.join(image_directory, x), tokenizer, max_length, inception_fe))
resnet_samples['resnet_caption'] = resnet_samples['image_name'].apply(lambda x: predict_caption(resnet_caption_model, os.path.join(image_directory, x), tokenizer, max_length, resnet_model))

# Use each sampled row's own caption as its reference. This keeps the i-th
# reference and the i-th hypothesis on the same image; selecting from `data`
# with isin() returned every caption of every sampled image in file order,
# which did not line up with the hypotheses.
vgg_original_captions = vgg_samples['comment'].tolist()
inception_original_captions = inception_samples['comment'].tolist()
resnet_original_captions = resnet_samples['comment'].tolist()

# Calculate BLEU scores for each model
vgg_bleu_score = calculate_bleu_score(vgg_original_captions, vgg_samples['vgg_caption'].tolist())
inception_bleu_score = calculate_bleu_score(inception_original_captions, inception_samples['inception_caption'].tolist())
resnet_bleu_score = calculate_bleu_score(resnet_original_captions, resnet_samples['resnet_caption'].tolist())

print("VGG16 BLEU Score:", vgg_bleu_score)
print("InceptionV3 BLEU Score:", inception_bleu_score)
print("ResNet50 BLEU Score:", resnet_bleu_score)

# Find the best-performing model based on BLEU score
best_model = max(zip(['VGG16', 'InceptionV3', 'ResNet50'], [vgg_bleu_score, inception_bleu_score, resnet_bleu_score]), key=lambda x: x[1])
print("Best-performing model:", best_model[0])

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_sequence_metrics(references, hypotheses):
    """Derive accuracy, precision, recall and F1 from per-caption BLEU scores.

    Each reference/hypothesis pair (paired positionally with ``zip``) is scored
    with ``sentence_bleu``. A pair is labelled 1 when its score is at least
    the median of all scores, otherwise 0. Those labels are compared against a
    ground truth that is 1 for every pair.

    NOTE: because the ground truth is constant and the threshold is the
    median of the scores themselves, the returned numbers describe how the
    scores are distributed around their median rather than caption quality
    in absolute terms.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    bleu_scores = [
        sentence_bleu([ref.split()], hyp.split())
        for ref, hyp in zip(references, hypotheses)
    ]

    # The median of the observed scores is the pass/fail cut-off.
    threshold = median(bleu_scores)

    y_pred = [int(score >= threshold) for score in bleu_scores]
    y_true = [1] * len(y_pred)

    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


# Evaluate all three models on the SAME random caption rows so their metrics
# are comparable; each model gets its own copy to hold its generated captions.
metric_eval_rows = test.sample(10)
vgg_samples = metric_eval_rows.copy()
inception_samples = metric_eval_rows.copy()
resnet_samples = metric_eval_rows.copy()

# NOTE(review): `predict_caption` resolves to the most recent definition in this
# file (the ResNet50 variant, 224x224 input), so the VGG and Inception models
# are not fed through their own preprocessing here -- confirm and route each
# model through its matching pipeline.
vgg_samples['vgg_caption'] = vgg_samples['image_name'].apply(lambda x: predict_caption(vgg_caption_model, os.path.join(image_directory, x), tokenizer, max_length, vgg_fe))
inception_samples['inception_caption'] = inception_samples['image_name'].apply(lambda x: predict_caption(inception_caption_model, os.path.join(image_directory, x), tokenizer, max_length, inception_fe))
resnet_samples['resnet_caption'] = resnet_samples['image_name'].apply(lambda x: predict_caption(resnet_caption_model, os.path.join(image_directory, x), tokenizer, max_length, resnet_model))

# Use each sampled row's own caption as its reference. This keeps the i-th
# reference and the i-th hypothesis on the same image; selecting from `data`
# with isin() returned every caption of every sampled image in file order,
# which did not line up with the hypotheses.
vgg_original_captions = vgg_samples['comment'].tolist()
inception_original_captions = inception_samples['comment'].tolist()
resnet_original_captions = resnet_samples['comment'].tolist()

# Calculate evaluation metrics for each model
vgg_metrics = calculate_sequence_metrics(vgg_original_captions, vgg_samples['vgg_caption'].tolist())
inception_metrics = calculate_sequence_metrics(inception_original_captions, inception_samples['inception_caption'].tolist())
resnet_metrics = calculate_sequence_metrics(resnet_original_captions, resnet_samples['resnet_caption'].tolist())

# Print the four metrics of every model under its own heading; the headings
# after the first start with a newline to leave a blank line between reports.
_metric_reports = (
    ("VGG16 Metrics:", vgg_metrics),
    ("\nInceptionV3 Metrics:", inception_metrics),
    ("\nResNet50 Metrics:", resnet_metrics),
)
for _heading, _scores in _metric_reports:
    print(_heading)
    print(f"Accuracy: {_scores[0]:.4f}")
    print(f"Precision: {_scores[1]:.4f}")
    print(f"Recall: {_scores[2]:.4f}")
    print(f"F1 Score: {_scores[3]:.4f}")

# In [None]:
# Load the pre-trained Inception caption model.
# NOTE: this rebinds `inception_caption_model`, which was already loaded from
# the same file earlier in this file.
inception_caption_model = load_model('inception_model.h5')

# Load the pre-trained Inception feature extractor (likewise a reload of a
# file loaded earlier).
inception_fe = load_model('inception_feature_extractor.h5')

import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input

# Function to load and preprocess an image
def load_and_preprocess_image(image_path):
    """Return the image at ``image_path`` as a preprocessed batch of one.

    The image is resized to 299x299, converted to an array, passed through
    the module-level ``preprocess_input`` and given a leading axis of size 1.
    """
    pixels = img_to_array(load_img(image_path, target_size=(299, 299)))
    return np.expand_dims(preprocess_input(pixels), axis=0)

# Prepare the data for the model.
# Every row of `train` is loaded and preprocessed separately, and all arrays
# stay in memory until np.vstack merges them into one batch.
# NOTE: `train_images` is rebound here; earlier in the file it held the list of
# training image names.
train_images = []
for img_path in train['image_name'].map(lambda x: os.path.join('flickr30k_images/flickr30k_images/flickr30k_images/', x)):
    train_images.append(load_and_preprocess_image(img_path))

train_images = np.vstack(train_images)
# NOTE(review): `train_sequences` is not defined anywhere in the visible part of
# this file -- confirm where it comes from.
train_data = [inception_fe.predict(train_images), train_sequences]

# Same procedure for the held-out rows in `test`; `val_images` is likewise
# rebound from a list of image names to an array of pixels.
val_images = []
for img_path in test['image_name'].map(lambda x: os.path.join('flickr30k_images/flickr30k_images/flickr30k_images/', x)):
    val_images.append(load_and_preprocess_image(img_path))

val_images = np.vstack(val_images)
# NOTE(review): `val_sequences` is not defined in the visible part of this file.
val_data = [inception_fe.predict(val_images), val_sequences]

# Dense, Dropout and Model are used below but were never imported in this
# file; import them here so the cell does not fail with NameError.
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model

# Add custom layers on top of the caption model: two Dense(1024, relu) blocks,
# each followed by Dropout(0.3), stacked on the output of its last layer.
# NOTE(review): presumably that last layer is already the vocabulary softmax of
# the loaded model -- confirm that stacking on top of it is intended.
input_shape = inception_caption_model.layers[-1].output_shape[1:]
x = inception_caption_model.layers[-1].output
x = Dense(1024, activation='relu')(x)
x = Dropout(0.3)(x)  # Add a Dropout layer with a rate of 0.3
x = Dense(1024, activation='relu')(x)
x = Dropout(0.3)(x)  # Add another Dropout layer

# Define the number of classes (vocabulary size)
num_classes = len(tokenizer.word_index) + 1  # Add 1 for the padding token

output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inception_caption_model.input, outputs=output)

# Freeze the layers of the caption model and feature extractor so that only
# the newly added layers remain trainable.
for layer in inception_caption_model.layers:
    layer.trainable = False
for layer in inception_fe.layers:
    layer.trainable = False

# `train_data` and `val_data` were already built above from preprocessed image
# arrays. They are deliberately NOT rebuilt here: the removed statements passed
# a Series of file-path strings straight to `inception_fe.predict`, which
# expects image arrays, and would have overwritten the valid inputs.

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): `train_labels` and `val_labels` are not defined in the visible
# part of this file -- confirm where they come from.
model.fit(train_data, train_labels, epochs=10, validation_data=(val_data, val_labels))