In [31]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from torchvision.models import resnet50
from torch.nn import functional as F

In [23]:
# Load pre-trained ResNet model
# `pretrained=True` fetches the published ResNet-50 checkpoint on first use; the download
# log in this cell's output shows it being cached under the torch hub checkpoints directory.
# NOTE(review): newer torchvision releases reportedly deprecate `pretrained=` in favour of
# `weights=` -- confirm the installed version before changing this call.
model = resnet50(pretrained=True)
# Switch the network to evaluation mode. `eval()` is deliberately the last expression of the
# cell: it returns the module, so the notebook displays the architecture repr.
model.eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\KIIT/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [32]:
# Load the vocabulary used by the model
# The file holds one token per line; a token's position in the resulting list is its id
# (beam_search looks up '<start>' and '<end>' with list.index()).
# The encoding is pinned to UTF-8 so the token list does not depend on the platform's
# locale code page (the default on Windows is not UTF-8).
with open('vocabulary.txt', 'r', encoding='utf-8') as f:
    vocabulary = f.read().splitlines()

In [33]:
# Preprocessing pipeline applied to every input image, in order:
#   1. resize to a fixed 224x224 resolution,
#   2. convert the PIL image to a tensor,
#   3. normalise each of the three channels with the given mean / std.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [34]:
# Function to generate captions for an image
def generate_captions(image_path, num_captions=1):
    """Caption a single image file.

    The image is loaded as RGB, pushed through the notebook-level ``transform``
    pipeline and ``model``, and the model output is handed to ``beam_search``
    together with the notebook-level ``vocabulary``.

    Args:
        image_path: Path of the image file to caption.
        num_captions: Number of captions requested; forwarded to ``beam_search``.

    Returns:
        Whatever ``beam_search`` returns (a list of caption strings).

    Raises:
        Any error raised by PIL while opening the file, by the model forward
        pass, or by ``beam_search``.
    """
    # Image.open is lazy and keeps the file handle open; convert() forces the pixel
    # data to load into a new image, so the handle can be released right away.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Add a leading batch axis: the model is called on a batch of one image.
    image = transform(image).unsqueeze(0)

    # Run the image through the model. This is inference only, so skip autograd
    # bookkeeping to save memory and time.
    # NOTE(review): `model` is a stock torchvision resnet50; its forward output is
    # presumably the ImageNet classification logits rather than pooled image
    # features -- confirm this is what the caption decoder expects.
    with torch.no_grad():
        features = model(image)

    # Insert a singleton axis at position 1 before handing the tensor to the decoder.
    features = features.unsqueeze(1)

    # Generate captions using beam search
    captions = beam_search(features, vocabulary, num_captions)

    return captions

In [35]:
# Function to perform beam search
def beam_search(features, vocabulary, num_captions, beam_width=5, max_length=20, decoder=None):
    """Decode captions with beam search.

    Args:
        features: Image representation, passed through to the decoder untouched.
        vocabulary: List of token strings; a token's list index is its id. Must
            contain ``'<start>'`` and ``'<end>'``.
        num_captions: How many of the best hypotheses to return.
        beam_width: Number of hypotheses kept alive after every step (the beam is
            widened to ``num_captions`` if that is larger).
        max_length: Maximum number of decoding steps; also the fixed length the
            token prefix is padded to before it is given to the decoder.
        decoder: Callable ``decoder(features, inputs)`` returning next-token scores.
            ``inputs`` is a ``(1, max_length)`` integer tensor holding the current
            prefix right-padded with the ``<end>`` id. The result must reduce to a
            1-D score vector after squeezing its two leading axes (i.e. shape
            ``(1, 1, vocab)``). Scores are summed along a hypothesis, so they should
            be additive (e.g. log-probabilities). Defaults to the notebook-level
            ``model.generate_captions`` when that attribute exists.

    Returns:
        Up to ``num_captions`` caption strings, best-scoring first. The leading
        ``<start>`` token and a trailing ``<end>`` token (if present) are removed.

    Raises:
        ValueError: If ``'<start>'`` / ``'<end>'`` is missing from ``vocabulary`` or
            the decoder output does not reduce to a 1-D score vector.
        AttributeError: If no ``decoder`` is given and the notebook-level ``model``
            has no ``generate_captions`` method.
    """
    if decoder is None:
        # A plain image classifier has no caption head; fail with an actionable
        # message instead of an opaque attribute lookup deep inside the loop.
        decoder = getattr(model, 'generate_captions', None)
        if decoder is None:
            raise AttributeError(
                f"'{type(model).__name__}' object has no attribute 'generate_captions'; "
                "pass a captioning decoder via the `decoder` argument"
            )

    start_token = vocabulary.index('<start>')
    end_token = vocabulary.index('<end>')

    # The beam must be able to hold every caption the caller asked for.
    beam_size = max(beam_width, num_captions, 1)

    # Each hypothesis is a (token_ids, cumulative_score) pair.
    beam = [([start_token], 0.0)]

    for _ in range(max_length):
        candidates = []
        for seq, score in beam:
            if seq[-1] == end_token:
                # Finished hypotheses stay in the running unchanged; dropping them
                # would throw away completed captions.
                candidates.append((seq, score))
                continue

            # Generate the next word predictions from the padded prefix.
            inputs = torch.tensor(seq).unsqueeze(0)
            inputs = F.pad(inputs, (0, max_length - inputs.size(1)), value=end_token)
            with torch.no_grad():
                output = decoder(features, inputs)

            step_scores = output.squeeze(0).squeeze(0)
            if step_scores.dim() != 1:
                raise ValueError(
                    "decoder output must have shape (1, 1, vocab_size); "
                    f"got {tuple(output.shape)}"
                )

            # Never ask topk for more entries than the score vector has.
            k = min(beam_size, step_scores.numel())
            values, indices = torch.topk(step_scores, k)

            # Every extension starts from the parent's score; the running total must
            # not leak from one sibling candidate into the next.
            for value, index in zip(values.tolist(), indices.tolist()):
                candidates.append((seq + [index], score + value))

        # Keep the best hypotheses (the sort is stable, so ties keep insertion order).
        candidates.sort(key=lambda item: item[1], reverse=True)
        beam = candidates[:beam_size]

        # Stop early once every surviving hypothesis has emitted the end token.
        if all(seq[-1] == end_token for seq, _ in beam):
            break

    # Convert the sequences to captions
    captions = []
    for seq, _ in beam[:max(num_captions, 0)]:
        tokens = seq[1:]  # drop <start>
        if tokens and tokens[-1] == end_token:
            # Only strip the last token when it really is <end>; a hypothesis cut
            # off by max_length ends in a real word.
            tokens = tokens[:-1]
        captions.append(' '.join(vocabulary[index] for index in tokens))

    return captions

In [36]:
# Test the model with a single image
# The path is relative, so it is resolved against the notebook's working directory.
# NOTE(review): the recorded output of this cell is an AttributeError raised by the
# `model.generate_captions` lookup inside beam_search -- the stock ResNet has no such method.
image_path = 'Image1.png'
captions = generate_captions(image_path)

# Print the generated captions, one per line
for caption in captions:
    print(caption)


AttributeError: 'ResNet' object has no attribute 'generate_captions'