In [1]:
import numpy as np
import pandas as pd
import os
import random
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torchvision.models import resnet50
from torchvision import transforms
from PIL import Image
from transformers import GPT2Tokenizer, GPT2LMHeadModel


In [11]:
# Define paths
images_folder = r"Images"
captions_file = r"captions.txt"

# Read captions from the file
with open(captions_file, 'r') as file:
    captions = file.readlines()

# Display first few image paths and captions
for i, filename in enumerate(os.listdir(images_folder)):
    image_path = os.path.join(images_folder, filename)
    caption = captions[i].strip() if i < len(captions) else "No caption available"
    print(f"Image Path: {image_path}")
    print(f"Caption: {caption}")
    print()
    if i == 5:
        break


Image Path: Images/1000268201_693b08cb0e.jpg
Caption: image,caption

Image Path: Images/1001773457_577c3a7d70.jpg
Caption: 1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .

Image Path: Images/1002674143_1b742ab4b8.jpg
Caption: 1000268201_693b08cb0e.jpg,A girl going into a wooden building .

Image Path: Images/1003163366_44323f5815.jpg
Caption: 1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .

Image Path: Images/1007129816_e794419615.jpg
Caption: 1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .

Image Path: Images/1007320043_627395c3d8.jpg
Caption: 1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .



In [12]:
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [13]:
# Define the transformation for the dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Define Flickr8kDataset
class Flickr8kDataset(Dataset):
    def __init__(self, annotations_file, img_dir, tokenizer, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.img_captions = pd.read_csv(annotations_file)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.img_captions)//5

    def __getitem__(self, idx):
        file_name = self.img_captions.iloc[5*idx, 0]
        img_path = os.path.join(self.img_dir, file_name)
        image = Image.open(img_path)
        caption = random.choice(self.img_captions.iloc[5*idx : 5*(idx+1), 1].tolist())
        tokenized_caption = self.tokenizer.encode(caption)
        if self.transform:
            image = self.transform(image)
        return image, tokenized_caption

# Initialize dataset and dataloaders
dataset = Flickr8kDataset(annotations_file=captions_file, img_dir=images_folder, tokenizer=tokenizer, transform=transform)
split_point = int(0.9*len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [split_point, len(dataset) - split_point])

def collate_fn(data):
    images, captions = zip(*data)
    images = torch.stack(images, 0)
    captions = [[tokenizer.bos_token_id] + cap + [tokenizer.eos_token_id] for cap in captions]
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    masks = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = torch.LongTensor(cap)
        masks[i, :end] = 1
    return images, targets, masks

train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)


In [5]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from torchvision import models
from torch.utils.data import DataLoader, Dataset

# Define your device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained ResNet
resnet = models.resnet50(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-2])  # Remove last fully connected layer and avgpool
resnet.to(device)
resnet.eval()  # Set ResNet to evaluation mode

# Load pre-trained GPT-2
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2.to(device)

# ImageCaptioningModel definition
class ImageCaptioningModel(nn.Module):
    def __init__(self, resnet, gpt2):
        super(ImageCaptioningModel, self).__init__()  # Corrected super call
        self.resnet = resnet
        self.gpt2 = gpt2
        self.proj = nn.Linear(2048, gpt2.config.hidden_size)

    def forward(self, images, input_ids, attention_mask=None):
        img_features = self.resnet(images)
        img_features = img_features.mean([2, 3])
        img_features = self.proj(img_features)
        input_embeddings = self.gpt2.transformer.wte(input_ids)
        combined_embeddings = input_embeddings + img_features.unsqueeze(1)
        outputs = self.gpt2(inputs_embeds=combined_embeddings, attention_mask=attention_mask, labels=input_ids)
        return outputs

# Training setup
model = ImageCaptioningModel(resnet, gpt2)
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 25
print_every = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    iteration_loss = 0

    for idx, batch in enumerate(train_dataloader, 1):
        batch = tuple(t.to(device) for t in batch)
        images, input_ids, masks = batch
        optimizer.zero_grad()
        outputs = model(images, input_ids, attention_mask=masks)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        iteration_loss += loss.item()
        if idx % print_every == 0:
            print(f"Epoch {epoch + 1}/{num_epochs} | Iteration {idx}/{len(train_dataloader)} | Training Loss: {iteration_loss / print_every:.3f}")
            iteration_loss = 0

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            images, input_ids, masks = batch
            outputs = model(images, input_ids, attention_mask=masks)
            loss = outputs.loss
            val_loss += loss.item()
    val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} | Training Loss: {total_loss / len(train_dataloader):.3f} | Validation Loss: {val_loss:.3f}\n")




Epoch 1/25 | Iteration 50/228 | Training Loss: 4.220
Epoch 1/25 | Iteration 100/228 | Training Loss: 2.326
Epoch 1/25 | Iteration 150/228 | Training Loss: 1.811
Epoch 1/25 | Iteration 200/228 | Training Loss: 1.670
Epoch 1/25 | Training Loss: 2.402 | Validation Loss: 1.610

Epoch 2/25 | Iteration 50/228 | Training Loss: 1.592
Epoch 2/25 | Iteration 100/228 | Training Loss: 1.600
Epoch 2/25 | Iteration 150/228 | Training Loss: 1.625
Epoch 2/25 | Iteration 200/228 | Training Loss: 1.537
Epoch 2/25 | Training Loss: 1.586 | Validation Loss: 1.561

Epoch 3/25 | Iteration 50/228 | Training Loss: 1.598
Epoch 3/25 | Iteration 100/228 | Training Loss: 1.506
Epoch 3/25 | Iteration 150/228 | Training Loss: 1.557
Epoch 3/25 | Iteration 200/228 | Training Loss: 1.544
Epoch 3/25 | Training Loss: 1.544 | Validation Loss: 1.476

Epoch 4/25 | Iteration 50/228 | Training Loss: 1.497
Epoch 4/25 | Iteration 100/228 | Training Loss: 1.488
Epoch 4/25 | Iteration 150/228 | Training Loss: 1.488
Epoch 4/25 | I

In [7]:

torch.save(model.state_dict(), 'image_captioning_model.pth')

In [4]:
import torch
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms, models
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn as nn
import gradio as gr

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the transformation for the input image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Load the pretrained ResNet model
resnet = models.resnet50(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-2])
resnet.eval()
resnet.to(device)

# Define the image captioning model
class ImageCaptioningModel(nn.Module):
    def __init__(self, resnet, gpt2):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = resnet
        self.gpt2 = gpt2
        self.proj = nn.Linear(2048, gpt2.config.hidden_size)

    def forward(self, images, input_ids, attention_mask=None):
        img_features = self.resnet(images)
        img_features = img_features.mean([2, 3])
        img_features = self.proj(img_features)
        input_embeddings = self.gpt2.transformer.wte(input_ids)
        combined_embeddings = input_embeddings + img_features.unsqueeze(1)
        outputs = self.gpt2(inputs_embeds=combined_embeddings, attention_mask=attention_mask, labels=input_ids)
        return outputs

# Initialize the image captioning model and load the state dictionary
model = ImageCaptioningModel(resnet, gpt2)
model.load_state_dict(torch.load(r"image_captioning_model.pth", map_location=device))
model.to(device)
model.eval()

# Function to generate caption
def generate_caption(image, model, tokenizer, max_length=50, temperature=1.0):
    caption = [tokenizer.bos_token_id]
    model.eval()
    with torch.no_grad():
        for i in range(max_length):
            input_ids = torch.LongTensor(caption).unsqueeze(0).to(device)
            outputs = model(image.unsqueeze(0), input_ids)
            logits = outputs.logits[:, -1, :] / temperature
            predicted_id = logits.argmax(1).item()
            caption.append(predicted_id)
            if predicted_id == tokenizer.eos_token_id and i > 1:
                break
    generated_caption = tokenizer.decode(caption, skip_special_tokens=True)
    return generated_caption

# Function to generate caption for an image file
def generate_caption_for_image(image_path, model, tokenizer, transform, max_length=50, temperature=1.0):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    caption = generate_caption(image.squeeze(0), model, tokenizer, max_length, temperature)
    return caption

# Function to create a Gradio interface
def main():
    def caption_interface(image):
        generated_caption = generate_caption_for_image(image, model, tokenizer, transform)
        return generated_caption

    interface = gr.Interface(
        fn=caption_interface,
        inputs=gr.Image(type="filepath"),
        outputs=gr.Textbox(),
        title="Image Captioning",
        description="Upload an image to generate a caption using a trained model."
    )

    interface.launch()

if __name__ == "__main__":
    main()



Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.
