In [1]:
import pandas as pd

captions_file = 'dataset/captions.txt'
captions_df = pd.read_csv(captions_file)

print(captions_df.head())

captions_dict = {}
for idx, row in captions_df.iterrows():
    img_name, caption = row['image'], row['caption']
    if img_name not in captions_dict:
        captions_dict[img_name] = []
    captions_dict[img_name].append(caption)

print(f"Loaded {len(captions_dict)} images with captions.")
for img_name, captions in list(captions_dict.items())[:5]:
    print(f"Image: {img_name}")
    for caption in captions:
        print(f" - {caption}")

                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  
Loaded 8091 images with captions.
Image: 1000268201_693b08cb0e.jpg
 - A child in a pink dress is climbing up a set of stairs in an entry way .
 - A girl going into a wooden building .
 - A little girl climbing into a wooden playhouse .
 - A little girl climbing the stairs to her playhouse .
 - A little girl in a pink dress going into a wooden cabin .
Image: 1001773457_577c3a7d70.jpg
 - A black dog and a spotted dog are fighting
 - A black dog and a tri-colored dog playing 

In [None]:
import os
import numpy as np
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from PIL import Image

image_folder = 'dataset/Images_resized/'  

inception_model = InceptionV3(weights='imagenet', include_top=False)
model = Model(inception_model.input, inception_model.layers[-1].output)

def preprocess_image(image_path, target_size=(299, 299)):
    try:
        img = Image.open(image_path).resize(target_size)
        img = np.array(img) / 127.5 - 1.0  
        img = np.expand_dims(img, axis=0)  
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

def extract_features(image_path, model):
    img = preprocess_image(image_path)
    if img is not None:
        feature_vector = model.predict(img)  
        return feature_vector
    else:
        return None

image_features_dict = {}
for filename in os.listdir(image_folder):
    if filename.endswith('.jpg'):
        image_path = os.path.join(image_folder, filename)
        features = extract_features(image_path, model)
        if features is not None:
            image_features_dict[filename] = features  
        print(f"Processed: {filename}")

import pickle
with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features_dict, f)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
Processed: 1000268201_693b08cb0e.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
Processed: 1001773457_577c3a7d70.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
Processed: 1002674143_1b742ab4b8.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step
Processed: 1003163366_44323f5815.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step
Processed: 1007129816_e794419615.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
Processed: 1007320043_627395c3d8.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
Processed: 1009434119_febe49276

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

all_captions = [caption for captions in captions_dict.values() for caption in captions]

tokenizer = Tokenizer(num_words=5000, oov_token='<unk>')
tokenizer.fit_on_texts(all_captions)

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

max_length = 20  

def preprocess_captions(captions, tokenizer, max_length):
    seqs = tokenizer.texts_to_sequences(captions)
    return pad_sequences(seqs, maxlen=max_length, padding='post')

example_img = list(captions_dict.keys())[0]
processed_captions = preprocess_captions(captions_dict[example_img], tokenizer, max_length)
print(processed_captions)

[[   2   42    3    2   89  171    6  118   50    2  392   11  393    3
    27    1  669    0    0    0]
 [   2   18  315   63    2  195  116    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [   2   39   18  118   63    2  195 2436    0    0    0    0    0    0
     0    0    0    0    0    0]
 [   2   39   18  118    4  393   19   59 2436    0    0    0    0    0
     0    0    0    0    0    0]
 [   2   39   18    3    2   89  171  315   63    2  195 2980    0    0
     0    0    0    0    0    0]]


In [32]:
train_image_features = []
train_captions = []

for img_name, captions in captions_dict.items():
    if img_name in image_features_dict:
        processed_captions = preprocess_captions(captions, tokenizer, max_length)
        image_features = image_features_dict[img_name]
        
        if image_features.shape == (1, 8, 8, 2048):
            image_features = np.squeeze(image_features, axis=0)  
        
        if image_features.shape == (8, 8, 2048):
            image_features = np.mean(image_features, axis=(0, 1))  
        
        assert image_features.shape == (2048,), f"Unexpected shape after processing: {image_features.shape}"
        
        for caption in processed_captions:
            train_image_features.append(image_features)
            train_captions.append(caption)

train_image_features = np.array(train_image_features)
train_captions = np.array(train_captions)

print(f"Training data: {train_image_features.shape}, {train_captions.shape}")

Training data: (40455, 2048), (40455, 20)


In [33]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, concatenate, Input
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import Accuracy


def define_caption_model(vocab_size, embedding_dim, max_length):
    inputs1 = Input(shape=(2048,))  
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)

    inputs2 = Input(shape=(max_length,))  
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = LSTM(512)(se1)

    decoder1 = concatenate([fe2, se2])
    decoder2 = Dense(512, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256

caption_model = define_caption_model(vocab_size, embedding_dim, max_length)
caption_model.summary()

In [43]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def data_generator(image_features, captions, batch_size, max_length, vocab_size):
    while True:
        for i in range(0, len(captions), batch_size):
            batch_image_features = image_features[i:i+batch_size]
            batch_captions = captions[i:i+batch_size]
            
            X1, X2, y = [], [], []
            for j in range(len(batch_captions)):
                caption = batch_captions[j]
                image_feature = batch_image_features[j]  
                
                for k in range(1, len(caption)):
                    input_sequence = caption[:k]
                    target_word = caption[k]
                    
                    input_sequence = pad_sequences([input_sequence], maxlen=max_length, padding='post')[0]
                    
                    X1.append(image_feature)
                    X2.append(input_sequence)
                    
                    y.append(target_word)
            
            X1 = np.array(X1, dtype=np.float32)  
            X2 = np.array(X2, dtype=np.int32)    
            y = np.array(y, dtype=np.int32)      
            
            yield (X1, X2), y  

In [44]:
def create_dataset(image_features, captions, batch_size, max_length, vocab_size):
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(image_features, captions, batch_size, max_length, vocab_size),
        output_signature=(
            (
                tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),  
                tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)  
            ),
            tf.TensorSpec(shape=(None,), dtype=tf.int32)  
        )
    )
    return dataset

In [45]:
from sklearn.model_selection import train_test_split

train_image_features, val_image_features, train_captions, val_captions = train_test_split(
    train_image_features, train_captions, test_size=0.2, random_state=42)

In [46]:
batch_size = 64
max_length = 20  
vocab_size = len(tokenizer.word_index) + 1  

train_dataset = create_dataset(train_image_features, train_captions, batch_size, max_length, vocab_size)
val_dataset = create_dataset(val_image_features, val_captions, batch_size, max_length, vocab_size)

train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [47]:
caption_model.compile(
    loss='sparse_categorical_crossentropy',  
    optimizer='adam',
    metrics=['accuracy']  
)

In [49]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

caption_model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=20,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    callbacks=[early_stopping, model_checkpoint]  
)

Epoch 1/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1467s[0m 2s/step - accuracy: 0.5195 - loss: 3.2954 - val_accuracy: 0.6029 - val_loss: 2.1647
Epoch 2/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1433s[0m 2s/step - accuracy: 0.6133 - loss: 2.0361 - val_accuracy: 0.6218 - val_loss: 1.9820
Epoch 3/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1430s[0m 2s/step - accuracy: 0.6307 - loss: 1.8101 - val_accuracy: 0.6310 - val_loss: 1.9308
Epoch 4/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1408s[0m 2s/step - accuracy: 0.6460 - loss: 1.6403 - val_accuracy: 0.6354 - val_loss: 1.9141
Epoch 5/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1375s[0m 2s/step - accuracy: 0.6619 - loss: 1.5041 - val_accuracy: 0.6276 - val_loss: 1.9668
Epoch 6/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1348s[0m 2s/step - accuracy: 0.6754 - loss: 1.4011 - val_accuracy: 0.6166 - val_loss: 2.0840
Epoch 7/20
[1m6

<keras.src.callbacks.history.History at 0x1aa70f06f90>

In [50]:
# caption_model.save('caption_model.h5')



In [51]:
caption_model.save('caption_model.keras')