In [None]:
!pip install kagglehub

In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, RepeatVector
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
import cv2
import matplotlib.pyplot as pt
import kagglehub

In [None]:
import kagglehub
path = kagglehub.dataset_download("adityajn105/flickr8k")
print(f"Path to dataset files: {path}")

In [None]:
caption_file = os.path.join(path, 'captions.txt')
image_folder = os.path.join(path, 'Images')

In [84]:
def load_data(image_folder, caption_file):
    image_paths, captions = [], []
    imgs = []
    with open(caption_file, 'r') as f:
        for line in f:
            line = line.strip().split(',')  # Assuming tab-separated file, adjust if comma
            image_name, caption = line[0], line[1]
            image_paths.append(os.path.join(image_folder, image_name))
            captions.append(caption)
            img = cv2.imread(os.path.join(image_folder, image_name))
            imgs.append(img)
    return imgs, image_paths, captions

In [None]:
imgs, image_paths, captions = load_data(image_folder, caption_file)

In [None]:
fig = pt.figure(figsize=(10,10))

for i in range(25):
    pt.subplot(5,5,i+1)
    pt.imshow(imgs[i])

In [25]:
image_paths = image_paths[1:12]

In [26]:
captions = captions[1:12]

In [None]:
cnn_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
cnn_model = Model(inputs=cnn_model.inputs, outputs=cnn_model.layers[-1].output)

In [64]:
import matplotlib.pyplot as plt

def extract_features(image):
    image = np.expand_dims(image, axis=0)  # Expand dims to simulate batch
    return cnn_model.predict(image)

In [65]:
def build_captioning_model(vocab_size, max_caption_length):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=256, input_length=max_caption_length))
    model.add(LSTM(256, return_sequences=True))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [66]:
vocab_size = 1000           # Dummy vocab size for captions
max_caption_length = 10      # Max length of captions

# Build and summarize the model
captioning_model = build_captioning_model(vocab_size, max_caption_length)
captioning_model.summary()

# Dummy data for demonstration
image = np.random.rand(224, 224, 3)  # Dummy image data
caption = np.random.randint(1, vocab_size, (1, max_caption_length))

In [67]:
image_features = extract_features(image)
print("Extracted Image Features:", image_features.shape)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step
Extracted Image Features: (1, 7, 7, 512)


In [69]:
captioning_model.fit(caption, np.random.rand(1, max_caption_length, vocab_size), epochs=10,verbose=1)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 3476.6304
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 3476.5754
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 3476.5261
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 3476.4832
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 3476.4497
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 3476.4324
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 3476.4570
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 3476.5845
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 3476.9863
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 3

<keras.src.callbacks.history.History at 0x2271df0d010>

In [70]:
reference_captions = [
    ["a", "sample", "caption", "of", "an", "image"],
    ["another", "description", "of", "the", "image", "content"]
]

In [None]:
!pip install nltk

In [74]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu(reference, candidate):
    reference = [reference]  # NLTK BLEU expects list of references
    return sentence_bleu(reference, candidate)

In [76]:
dummy_generated_caption = ["this", "is", "a", "generated", "caption"]
bleu_score = evaluate_bleu(reference_captions[0], dummy_generated_caption)
print("BLEU score for the generated caption:", bleu_score)

BLEU score for the generated caption: 1.1862177682648818e-231
