<a href="https://colab.research.google.com/github/prithiba-A/CODSOFT/blob/main/Task_3_Image_captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install necessary packages
# pip install torch torchvision nltk

import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string
import nltk

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Placeholder for loading image data and captions
def load_data():
    """Placeholder for loading image data and captions.

    Replace the body with the actual data loading logic; the intended
    return value is a tuple of ``(images, captions)``. Until then the
    function does nothing and returns ``None``.
    """
    return None

# Define the image preprocessing pipeline.
# The encoder in this file loads ImageNet-pretrained VGG16 weights, and
# torchvision documents those weights as expecting inputs scaled to [0, 1]
# and then normalized with the ImageNet per-channel mean/std below. Without
# the Normalize step the pretrained network sees inputs from a different
# distribution than the one it was trained on.
# NOTE(review): Normalize with 3-element mean/std assumes 3-channel (RGB)
# images - confirm the loader converts grayscale images to RGB.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Define the encoder (VGG in this case)
class EncoderCNN(nn.Module):
    """Encode images into fixed-size feature vectors with pretrained VGG16.

    The VGG16 network is kept up to (but not including) its final
    classification layer, and a trainable linear layer projects the
    resulting activations to ``embed_size`` dimensions.

    Args:
        embed_size: Dimensionality of the output feature vector.
    """

    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        vgg = models.vgg16(pretrained=True)
        # torchvision's VGG has three children: `features`, `avgpool` and
        # `classifier`. Slicing `vgg.children()` by [:-1] would discard the
        # *entire* classifier and leave 512*7*7 conv features, which does not
        # match the `in_features` of the last classifier layer used to size
        # `self.embed`. Instead, drop only the last fully connected layer of
        # the classifier so the trunk outputs exactly that many features.
        classifier_head = list(vgg.classifier.children())[:-1]
        self.vgg = nn.Sequential(
            vgg.features,
            vgg.avgpool,
            nn.Flatten(),  # VGG flattens between avgpool and classifier
            *classifier_head
        )
        self.embed = nn.Linear(vgg.classifier[-1].in_features, embed_size)

    def forward(self, images):
        """Return a ``(batch, embed_size)`` tensor of image features.

        Args:
            images: Batch of image tensors accepted by VGG16
                (presumably ``(batch, 3, 224, 224)`` - confirm with caller).
        """
        features = self.vgg(images)
        # Already flat after nn.Flatten; kept as a cheap guard so the
        # projection always receives a 2-D (batch, features) tensor.
        features = features.view(features.size(0), -1)
        features = self.embed(features)
        return features

# Define the decoder (LSTM in this case)
class DecoderRNN(nn.Module):
    """LSTM decoder that predicts caption tokens from image features.

    Args:
        embed_size: Size of the word embeddings (and of the image feature
            vector that is prepended to the caption sequence).
        hidden_size: Number of features in the LSTM hidden state.
        vocab_size: Number of tokens in the vocabulary; also the size of
            the output logits.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions, lengths):
        """Compute vocabulary logits for every non-padded time step.

        Args:
            features: Image features, either ``(batch, embed_size)`` or
                already carrying a length-1 sequence axis,
                ``(batch, 1, embed_size)``.
            captions: Integer token ids, ``(batch, seq_len)``.
            lengths: Valid length of each sequence *after* the image feature
                has been prepended. ``pack_padded_sequence`` is called with
                its default ``enforce_sorted=True``, so these must be sorted
                in decreasing order.

        Returns:
            Tensor of shape ``(sum(lengths), vocab_size)`` holding the logits
            in packed-sequence order.
        """
        embeddings = self.embed(captions)
        # Accept features with or without the sequence axis so that callers
        # which already unsqueezed them do not end up with a 4-D tensor that
        # cannot be concatenated with the 3-D embeddings.
        if features.dim() == 2:
            features = features.unsqueeze(1)
        # The image feature acts as the first "token" of the sequence.
        embeddings = torch.cat((features, embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        # `hiddens` is a PackedSequence; `.data` is the flat tensor of all
        # valid time steps.
        outputs = self.linear(hiddens.data)
        return outputs


# Define the ImageCaptioning model
class ImageCaptioningModel(nn.Module):
    """End-to-end captioning model: CNN encoder followed by an LSTM decoder.

    Args:
        embed_size: Size of the image feature / word embedding vectors.
        hidden_size: Number of features in the decoder LSTM hidden state.
        vocab_size: Number of tokens in the vocabulary.
        num_layers: Number of stacked LSTM layers in the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions, lengths):
        """Encode ``images`` and decode them against ``captions``.

        Args:
            images: Batch of images passed to the encoder.
            captions: Token ids passed to the decoder.
            lengths: Sequence lengths passed to the decoder.

        Returns:
            Whatever the decoder returns for the encoded features.
        """
        # The encoder output is handed over as-is, without adding a sequence
        # axis here. Adding one would make the features 4-D once the decoder
        # inserts its own sequence axis, and they could no longer be
        # concatenated with the 3-D caption embeddings.
        features = self.encoder(images)
        outputs = self.decoder(features, captions, lengths)
        return outputs


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
