In [20]:
import numpy as np
import tensorflow as tf
import pickle
import random
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Add
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Generate synthetic captions (for demonstration purposes)
subjects = [
    "a man", "a woman", "a child", "a boy", "a girl", "a dog", "a cat", "a bird", "a robot", "a cyclist",
    "an artist", "a teacher", "a chef", "a student", "a scientist", "a musician", "a photographer", 
    "a farmer", "a soldier", "an astronaut"
]

verbs = [
    "is running", "is jumping", "is playing", "is driving", "is eating", "is sleeping", "is flying", 
    "is painting", "is singing", "is teaching", "is walking", "is exploring", "is climbing", 
    "is working", "is building", "is designing", "is studying", "is observing", "is shopping", "is relaxing"
]

objects = [
    "in the park", "on the road", "at home", "in the sky", "in the kitchen", "on a mountain", "on a beach", 
    "in the forest", "by the river", "in a school", "in a city", "on a boat", "in a car", "on a plane", 
    "in the desert", "on a rooftop", "in a garden", "under the stars", "on a bridge", "at a market"
]

adjectives = ["happy", "angry", "excited", "tired", "curious"]
adverbs = ["quickly", "gracefully", "loudly", "eagerly", "slowly"]

captions = []
for _ in range(10000):  # Generate 10000 synthetic captions
    subject = random.choice(subjects)
    verb = random.choice(verbs)
    object_ = random.choice(objects)
    adjective = random.choice(adjectives)
    adverb = random.choice(adverbs)

    caption = f"startseq {adjective} {subject} {verb} {adverb} {object_} endseq"
    captions.append(caption)

# Step 2: Initialize the tokenizer and fit on the captions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)

# Save the tokenizer
with open("tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)

print(f"Tokenizer saved with {len(tokenizer.word_index)} words.")

# Step 3: Define parameters for the captioning model
max_sequence_length = 30
embedding_dim = 256
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
image_feature_size = 2048  # Size of the features from InceptionV3

# Load InceptionV3 model for image feature extraction
inception_model = InceptionV3(include_top=False, weights='imagenet')
inception_model = Model(inputs=inception_model.input, outputs=inception_model.output)

def extract_features(image_path):
    """Extract features from an image using InceptionV3."""
    image = tf.keras.preprocessing.image.load_img(image_path, target_size=(299, 299))
    image = tf.keras.preprocessing.image.img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = tf.keras.applications.inception_v3.preprocess_input(image)
    features = inception_model.predict(image, verbose=0)
    return features

# Step 4: Define the captioning model
image_input = tf.keras.Input(shape=(image_feature_size,))
image_features = Dense(embedding_dim, activation='relu')(image_input)

caption_input = tf.keras.Input(shape=(max_sequence_length,))
caption_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(caption_input)
caption_lstm = LSTM(256)(caption_embedding)

decoder_input = Add()([image_features, caption_lstm])
output = Dense(vocab_size, activation='softmax')(decoder_input)

caption_model = Model(inputs=[image_input, caption_input], outputs=output)

# Compile the model
caption_model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Save the trained model in the Keras format
caption_model.save("caption_model.keras")
print("Model saved to caption_model.keras")

# Step 5: Example of how to train the model
# When you have image features and tokenized captions, you can train the model:
# X_image_features = np.array([...])  # Extracted features from images
# X_captions = pad_sequences([...], maxlen=max_sequence_length)  # Tokenized captions
# y_output = np.array([...])  # One-hot encoded labels for next word prediction

# Train the model (example code, not functional without actual data):
# caption_model.fit([X_image_features, X_captions], y_output, epochs=20, batch_size=64)


Tokenizer saved with 81 words.
Model saved to caption_model.keras


In [23]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Load the tokenizer
with open("tokenizer.pkl", "rb") as file:
    tokenizer = pickle.load(file)

# Load the trained model
caption_model = tf.keras.models.load_model("caption_model.keras")

# Define parameters
max_sequence_length = 30  # Maximum length of captions
embedding_dim = 256
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
image_feature_size = 2048  # Size of the features from InceptionV3

# Load InceptionV3 model for image feature extraction
inception_model = InceptionV3(include_top=False, weights='imagenet')
inception_model = Model(inputs=inception_model.input, outputs=inception_model.output)

# Apply Global Average Pooling to reduce the shape of the output to (1, 2048)
def extract_features(image_path):
    """Extract features from an image using InceptionV3."""
    image = load_img(image_path, target_size=(299, 299))  # Resize image to 299x299
    image = img_to_array(image)  # Convert to array
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    image = tf.keras.applications.inception_v3.preprocess_input(image)  # Preprocess image for InceptionV3
    
    # Extract features from InceptionV3
    features = inception_model.predict(image, verbose=0)
    
    # Apply Global Average Pooling to reduce the feature map shape
    pooling_layer = GlobalAveragePooling2D()
    features = pooling_layer(features)
    
    # Now the features have shape (1, 2048)
    return features

def generate_caption(image_path, model, tokenizer, max_sequence_length):
    """Generate a caption for a given image."""
    # Extract image features
    image_features = extract_features(image_path)
    
    # Start the caption with the 'startseq' token
    caption = 'startseq'
    
    for _ in range(max_sequence_length):
        # Tokenize the caption
        sequence = tokenizer.texts_to_sequences([caption])
        sequence = pad_sequences(sequence, maxlen=max_sequence_length)
        
        # Predict the next word
        predicted_probs = model.predict([image_features, sequence], verbose=0)
        
        # Get the word index of the predicted word
        predicted_word_index = np.argmax(predicted_probs)
        
        # Get the word corresponding to the predicted index
        predicted_word = tokenizer.index_word.get(predicted_word_index, '')
        
        # If the predicted word is 'endseq', stop generating the caption
        if predicted_word == 'endseq':
            break
        
        # Append the predicted word to the caption
        caption += ' ' + predicted_word
    
    return caption

# Example: Predict a caption for a sample image
image_path = 'dog1.jpeg'  # Replace with your image path
generated_caption = generate_caption(image_path, caption_model, tokenizer, max_sequence_length)

# Print the generated caption
print("Generated Caption:", generated_caption)






Generated Caption: startseq relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing
