In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import json
import os
from PIL import Image
import pickle

In [None]:
# Configuration: module-level hyperparameters read as globals by train_model below
max_length = 50  # Maximum caption length in tokens; captions are padded to this length
vocab_size = 10000  # Vocabulary size: the tokenizer's num_words and the width of the softmax output
embedding_dim = 256  # Size of each word-embedding vector
units = 512  # Width of the image Dense layer and of the caption LSTM
epochs = 20  # Number of epochs passed to model.fit
batch_size = 32  # Mini-batch size passed to model.fit

In [None]:
# Load COCO caption data (assumes the 2017 training split has already been downloaded).
# This reads the caption annotation JSON directly; pycocotools is an alternative loader.
def load_coco_data(data_dir):
    """Pair every usable COCO training caption with the path of its image.

    Reads ``annotations/captions_train2017.json`` under ``data_dir`` and keeps
    only the captions whose image file exists in ``<data_dir>/train2017``.

    Args:
        data_dir: Root directory of the COCO download.

    Returns:
        A tuple ``(images, captions)`` of two equally long lists in annotation
        order: ``images[i]`` is the path of the image that ``captions[i]``
        describes. An image with several captions appears once per caption.

    Raises:
        OSError: If the annotation file cannot be opened.
        KeyError: If the JSON lacks the ``'annotations'`` or ``'images'`` keys.
    """
    annotations_path = os.path.join(data_dir, 'annotations/captions_train2017.json')
    # JSON is UTF-8 by specification; do not depend on the platform's locale default.
    with open(annotations_path, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    image_dir = os.path.join(data_dir, 'train2017')
    image_id_to_path = {
        img['id']: os.path.join(image_dir, img['file_name'])
        for img in annotations['images']
    }

    # Several captions share one image, so remember each existence check
    # instead of hitting the filesystem once per caption.
    path_exists = {}
    images = []
    captions = []
    for ann in annotations['annotations']:
        image_path = image_id_to_path.get(ann['image_id'])
        if image_path is None:
            # The annotation references an image that is not listed under
            # 'images'; skip it the same way a missing file is skipped.
            continue
        exists = path_exists.get(image_path)
        if exists is None:
            exists = path_exists[image_path] = os.path.exists(image_path)
        if exists:
            images.append(image_path)
            captions.append(ann['caption'])

    return images, captions

# Preprocess images (extract features using a CNN)
def preprocess_images(image_paths, batch_size=32):
    """Extract one InceptionV3 feature vector per image.

    Each image is decoded, forced to 3-channel RGB, resized to 299x299 and
    scaled with InceptionV3's own ``preprocess_input`` before being passed
    through the ImageNet-pretrained network (no classifier head, global
    average pooling).

    Args:
        image_paths: Iterable of image file paths.
        batch_size: Number of images sent through the network per ``predict``
            call. Only affects speed and memory; it is independent of the
            training batch size.

    Returns:
        A float array of shape ``(len(image_paths), 2048)`` whose rows follow
        the order of ``image_paths``. Empty input yields shape ``(0, 2048)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    paths = list(image_paths)
    if not paths:
        # Nothing to encode: skip building the network and keep the 2-D shape
        # that downstream code expects.
        return np.empty((0, 2048), dtype=np.float32)
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model = tf.keras.applications.InceptionV3(weights='imagenet', include_top=False, pooling='avg')
    # The pretrained weights expect this scaling of raw 0-255 pixels, not a
    # plain division by 255.
    preprocess_input = tf.keras.applications.inception_v3.preprocess_input

    features = []
    for start in range(0, len(paths), batch_size):
        batch = []
        for path in paths[start:start + batch_size]:
            # The context manager closes the file handle; convert('RGB')
            # normalises grayscale / RGBA / CMYK files to three channels.
            with Image.open(path) as img:
                rgb = img.convert('RGB').resize((299, 299))
                batch.append(np.asarray(rgb, dtype=np.float32))
        batch = preprocess_input(np.stack(batch))
        features.append(model.predict(batch, verbose=0))
    return np.concatenate(features, axis=0)

# Tokenize captions
def tokenize_captions(captions, max_length, vocab_size):
    """Fit a tokenizer on the captions and encode them as padded id sequences.

    Args:
        captions: List of caption strings.
        max_length: Length every encoded caption is padded to (padding is
            appended after the tokens).
        vocab_size: Passed to the tokenizer as ``num_words``.

    Returns:
        A tuple ``(tokenizer, padded_sequences)``: the fitted tokenizer, which
        uses ``'<unk>'`` as its out-of-vocabulary token, and the array of
        encoded captions.
    """
    caption_tokenizer = Tokenizer(oov_token='<unk>', num_words=vocab_size)
    caption_tokenizer.fit_on_texts(captions)
    encoded = pad_sequences(
        caption_tokenizer.texts_to_sequences(captions),
        padding='post',
        maxlen=max_length,
    )
    return caption_tokenizer, encoded

# Define the model
def create_model(vocab_size, embedding_dim, max_length, units):
    """Build and compile the two-branch captioning model.

    One branch projects a 2048-dimensional image feature vector through a
    Dense layer; the other embeds a caption token sequence and summarises it
    with an LSTM. The two summaries are concatenated and mapped to a softmax
    over the vocabulary, i.e. the model predicts a single word per sample.

    Args:
        vocab_size: Number of token ids; sets the embedding table size and the
            width of the softmax output.
        embedding_dim: Size of each word-embedding vector.
        max_length: Length of the caption input sequences.
        units: Width of the image Dense layer and of the LSTM.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, caption_ids]`` and
        returning a ``(batch, vocab_size)`` probability distribution. It is
        compiled with Adam and categorical cross-entropy, so targets must be
        one-hot encoded.
    """
    # Image feature input
    image_input = Input(shape=(2048,))  # Assuming InceptionV3 features
    image_dense = Dense(units, activation='relu')(image_input)
    image_dropout = Dropout(0.5)(image_dense)

    # Caption input
    caption_input = Input(shape=(max_length,))
    # mask_zero=True marks id 0 (the padding value) as "no token" so the LSTM
    # stops at the last real word instead of reading the trailing padding.
    # The sequence length already comes from caption_input, so the deprecated
    # input_length argument is not passed.
    caption_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(caption_input)
    caption_lstm = LSTM(units, return_sequences=False)(caption_embedding)
    caption_dropout = Dropout(0.5)(caption_lstm)

    # Combine
    combined = concatenate([image_dropout, caption_dropout])
    output = Dense(vocab_size, activation='softmax')(combined)

    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Main training function
def train_model(data_dir, num_samples=1000):
    """Train the captioning model on COCO and save it with its tokenizer.

    The model built by ``create_model`` predicts ONE next word from an image
    and a caption prefix, so every caption is expanded into
    (image, prefix, next word) training samples. Captions are wrapped in
    ``startseq`` / ``endseq`` markers so that the first word has a non-empty
    prefix and the model can learn where a caption ends.

    Reads the module-level ``max_length``, ``vocab_size``, ``embedding_dim``,
    ``units``, ``epochs`` and ``batch_size`` settings.

    Args:
        data_dir: Root directory of the COCO download.
        num_samples: Number of leading image/caption pairs to train on;
            ``None`` uses all of them.

    Side effects:
        Writes ``image_caption_model.h5`` and ``tokenizer.pickle`` to the
        current working directory.

    Raises:
        ValueError: If no image/caption pair is available.
    """
    # Load data
    image_paths, captions = load_coco_data(data_dir)
    if num_samples is not None:
        image_paths = image_paths[:num_samples]
        captions = captions[:num_samples]
    if not captions:
        raise ValueError(f"No image/caption pairs found under {data_dir!r}")

    # An image can carry several captions: run the CNN once per distinct image
    # and refer to its feature row by index afterwards.
    unique_paths = list(dict.fromkeys(image_paths))
    row_of_path = {path: row for row, path in enumerate(unique_paths)}
    unique_features = preprocess_images(unique_paths)

    # Plain-word markers are used because the tokenizer strips punctuation
    # such as '<' and '>'.
    marked_captions = [f'startseq {caption} endseq' for caption in captions]
    tokenizer, padded_captions = tokenize_captions(marked_captions, max_length, vocab_size)

    # Expand each caption into next-word samples. The tokenizer never emits
    # id 0 and padding is appended, so the non-zero count is the real length.
    prefix_length = max_length - 1
    feature_rows = []
    prefixes = []
    next_words = []
    for path, sequence in zip(image_paths, padded_captions):
        n_tokens = int(np.count_nonzero(sequence))
        for step in range(1, n_tokens):
            prefix = np.zeros(prefix_length, dtype=padded_captions.dtype)
            prefix[:step] = sequence[:step]
            feature_rows.append(row_of_path[path])
            prefixes.append(prefix)
            next_words.append(sequence[step])

    X_images = unique_features[feature_rows]
    X_captions = np.stack(prefixes)
    # One-hot targets of shape (samples, vocab_size) match the model's softmax
    # output; memory grows with num_samples * vocab_size.
    y = to_categorical(next_words, num_classes=vocab_size)

    # Create model
    model = create_model(vocab_size, embedding_dim, prefix_length, units)

    # Train
    model.fit([X_images, X_captions], y, epochs=epochs, batch_size=batch_size, validation_split=0.2)

    # Save model and tokenizer
    model.save('image_caption_model.h5')
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Model trained and saved.")

In [None]:
# Run training. 'path/to/coco' is a placeholder: point it at the directory that
# contains annotations/captions_train2017.json and the train2017/ image folder.
train_model('path/to/coco')  # Update this with your COCO dataset path

# Note: This is a basic example. It needs the COCO 2017 training images and caption
# annotations on disk, plus these packages: pip install tensorflow pillow numpy