In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input, Attention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from PIL import Image
import cv2




In [2]:
# Define paths to your dataset
images_dir = ".\Dataset\Images"
captions_file = ".\Dataset\captions.txt"

In [3]:
# Load captions into a DataFrame
captions_df = pd.read_csv(captions_file, delimiter='\t', header=None, names=['image', 'caption'])

In [4]:
# Display sample captions
print("Sample Captions:")
print(captions_df.head())

Sample Captions:
                                               image  caption
0  1000268201_693b08cb0e.jpg,A child in a pink dr...      NaN
1  1000268201_693b08cb0e.jpg,A girl going into a ...      NaN
2  1000268201_693b08cb0e.jpg,A little girl climbi...      NaN
3  1000268201_693b08cb0e.jpg,A little girl climbi...      NaN
4  1000268201_693b08cb0e.jpg,A little girl in a p...      NaN


In [5]:
#preprocess the image

def load_image(image_path):
    img = cv2.imread(image_path)
    # Resize image to a fixed size (e.g., 224x224)
    img = cv2.resize(img, (256, 256))
    # Normalize pixel values to [0, 1]
    img = img.astype(np.float32) / 255.0
    return img

In [6]:
# Preprocess all images and store in a dictionary

preprocessed_images = {}

for image_file in os.listdir(images_dir):
    image_path = os.path.join(images_dir, image_file)
    image_id = image_file.split('.')[0]
    preprocessed_images[image_id] = load_image(image_path)

In [7]:
# number of preprocessed images

print(f"Number of preprocessed images: {len(preprocessed_images)}")

Number of preprocessed images: 8091


In [8]:
# Load captions into a DataFrame
captions_data = []

with open(captions_file, "r") as file:
    for line in file:
        image_id, caption = line.strip().split(",", 1)
        captions_data.append({"image_id": image_id, "caption": caption})
        

captions_df = pd.DataFrame(captions_data)

In [9]:
# Tokenize captions

tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions_df["caption"].values)
vocab_size = len(tokenizer.word_index) + 1

In [19]:
# Convert captions to sequences of integers
sequences = tokenizer.texts_to_sequences(captions_df["caption"].values)

# Pad sequences to ensure uniform length
max_sequence_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding="post")
print(max_sequence_length)


37


In [11]:
# Map image filenames to their corresponding padded sequences

image_seqs_mapping = {}
for idx, row in captions_df.iterrows():
    image_seqs_mapping[row["image_id"]] = padded_sequences[idx]

In [12]:
# Load pre-trained ResNet50 model without the classification head

base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

# Create a new model with the desired output layer

feature_extractor = Model(inputs=base_model.input, outputs=base_model.output)





In [13]:
# Extract features from images and store in a dictionary

extracted_features = {}

for image_filename, img_array in preprocessed_images.items():
    img_input = preprocess_input(np.expand_dims(img_array, axis=0))
    features = feature_extractor.predict(img_input)
    extracted_features[image_filename] = features.flatten()



In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\PRADHISHWARRAN.
[nltk_data]     T\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
tokenized_captions = []

for caption in captions_df["caption"]:
    tokens = word_tokenize(caption.lower())  # Convert to lowercase and tokenize
    tokenized_captions.append(tokens)

# Now, tokenized_captions is a list of lists, where each inner list contains tokens for a caption

In [16]:
# Flatten the list of tokenized captions
all_tokens = [token for caption_tokens in tokenized_captions for token in caption_tokens]

# Count the frequency of each token
token_counter = Counter(all_tokens)

# Create a vocabulary with unique indices for each token
vocab = ['<PAD>', '<SOS>', '<EOS>'] + [token for token, _ in token_counter.items()]

# Create a dictionary to map tokens to indices and vice versa
token_to_index = {token: index for index, token in enumerate(vocab)}
index_to_token = {index: token for token, index in token_to_index.items()}

# Maximum sequence length (length of the longest caption)
max_seq_length = max(len(tokens) for tokens in tokenized_captions)

# Pad sequences of tokens to ensure uniform length
padded_sequences = []
for tokens in tokenized_captions:
    # Add start-of-sequence token
    tokens_with_sos = ['<SOS>'] + tokens
    # Add end-of-sequence token and pad sequence
    padded_sequence = tokens_with_sos[:max_seq_length] + ['<PAD>'] * (max_seq_length - len(tokens_with_sos) + 1)
    padded_sequences.append(padded_sequence)

# Convert tokens to indices
indexed_sequences = [[token_to_index[token] for token in tokens] for tokens in padded_sequences]

In [17]:
# Assuming you have a DataFrame called 'captions_df' containing image IDs and captions
# Extract unique image IDs
image_ids = captions_df['image_id'].unique()

# Split the dataset into training and the rest (validation + test)
train_image_ids, val_test_image_ids = train_test_split(image_ids, test_size=0.3, random_state=42)

# Further split the remaining data into validation and test sets
val_image_ids, test_image_ids = train_test_split(val_test_image_ids, test_size=0.5, random_state=42)

# Now, filter captions based on the split image IDs
train_captions = captions_df[captions_df['image_id'].isin(train_image_ids)]
val_captions = captions_df[captions_df['image_id'].isin(val_image_ids)]
test_captions = captions_df[captions_df['image_id'].isin(test_image_ids)]

In [22]:
# Define the encoder (using a pre-trained CNN model)
encoder_inputs = layers.Input(shape=(256, 256, 3))
encoder = ResNet50(weights='imagenet', include_top=False)
encoder_outputs = encoder(encoder_inputs)

# Add a dense layer to transform encoder_outputs
encoder_dense = layers.Dense(256, activation='relu')(encoder_outputs)
encoder_states = [encoder_dense, encoder_dense]
embedding_dim = 256

# Define the decoder (using LSTM)
decoder_inputs = layers.Input(shape=(max_sequence_length,))
embedding = layers.Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)

# Initialize initial state for the LSTM layer
#initial_state = [layers.Input(shape=(256,)), layers.Input(shape=(256,))]  # Matching LSTM state size [256, 256]

decoder_lstm = layers.LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(embedding, initial_state = encoder_states) 


ValueError: An `initial_state` was passed that is not compatible with `cell.state_size`. Received `state_spec`=ListWrapper([InputSpec(shape=(None, 8, 8, 256), ndim=4), InputSpec(shape=(None, 8, 8, 256), ndim=4)]); however `cell.state_size` is [256, 256]

In [79]:
# Attention mechanism

attention = layers.Attention()
context_vector = attention([decoder_outputs, encoder_outputs])
decoder_combined_context = layers.Concatenate(axis=-1)([decoder_outputs, context_vector])

ValueError: Exception encountered when calling layer "attention_2" (type Attention).

Dimensions must be equal, but are 256 and 2048 for '{{node attention_2/MatMul}} = BatchMatMulV2[T=DT_FLOAT, adj_x=false, adj_y=true](Placeholder, Placeholder_1)' with input shapes: [?,20,256], [?,8,8,2048].

Call arguments received by layer "attention_2" (type Attention):
  • inputs=['tf.Tensor(shape=(None, 20, 256), dtype=float32)', 'tf.Tensor(shape=(None, 8, 8, 2048), dtype=float32)']
  • mask=['tf.Tensor(shape=(None, 20), dtype=bool)', 'None']
  • training=None
  • return_attention_scores=False
  • use_causal_mask=False

In [None]:
# Output layer
output = layers.Dense(vocab_size, activation='softmax')(decoder_outputs)

# Define the model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')