# Importing Libraries

In [1]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset
import os
import json
from torch.utils.data import DataLoader
from transformers import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")



# Organising the Image-Label Data

In [4]:
annotations_folder = 'Dataset/Organized_Annotations/'

In [5]:
image_folder = 'Dataset/SSID_Images/'

In [6]:
with open('Dataset/Organized_Annotations/SSID_Train_Organized.json') as f:
    organized_data = json.load(f)

In [7]:
fine_tune_data = {}

album_limit = 100
processed_album_count = 0

In [8]:
for album_id in organized_data.items():
    if processed_album_count >= album_limit:
        break  # Stop after processing 100 albums

    album_id = album_id[1]

    # get the value of the first key in the dictionary
    stories = album_id[list(album_id.keys())[0]]

    for item in stories:

        # add a key value pair to the dictionary, key being the image_id and value being the storytext
        fine_tune_data[item['image_id']] = item['storytext']

In [9]:
print(len(fine_tune_data))

15625


In [10]:
#print(fine_tune_data)

In [11]:
sorted_fine_tune_data = dict(sorted(fine_tune_data.items(), key=lambda x: int(x[0])))

#print(sorted_fine_tune_data)

# Fine Tuning BLIP

In [12]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class ImageCaptionDataset(Dataset):
    def __init__(self, data_dict, image_folder, processor):
        """
        Args:
            data_dict (dict): A dictionary where keys are image names, and values are captions.
            image_folder (str): Path to the folder containing the images.
            processor (BlipProcessor): The processor to preprocess the images and captions.
        """
        self.data_dict = data_dict
        self.image_folder = image_folder
        self.processor = processor
        self.image_keys = list(data_dict.keys())
        
    def __len__(self):
        return len(self.image_keys)
    
    def __getitem__(self, idx):
        image_key = self.image_keys[idx]
        caption = self.data_dict[image_key]
        
        # Load the image
        image_path = f"{self.image_folder}/{image_key}.jpg"  # Assumes image is stored as jpg
        image = Image.open(image_path).convert('RGB')
        
        # Preprocess the image and caption
        inputs = self.processor(images=image, text=caption, return_tensors="pt", padding="max_length", truncation=True)
        
        return {
            "input_ids": inputs['input_ids'].squeeze(),  # Caption tokens
            "attention_mask": inputs['attention_mask'].squeeze(),  # Attention mask for the caption
            "pixel_values": inputs['pixel_values'].squeeze()  # Processed image tensor
        }

In [13]:
# get only first 100 images for fine tuning
sorted_fine_tune_data = dict(list(sorted_fine_tune_data.items())[:8])

In [14]:
print(len(sorted_fine_tune_data))

8


In [15]:
# Hyperparameters

epochs = 3
learning_rate = 5e-5
batch_size = 8

# Create the dataset and data loader
dataset = ImageCaptionDataset(sorted_fine_tune_data, image_folder, processor)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Set the model in training mode
model.train()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tuning loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    for batch in data_loader:
        # Move the data to the GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item()}")

Epoch 1/3




Loss: 13.1148042678833
Epoch 2/3
Loss: 11.080038070678711
Epoch 3/3
Loss: 10.039155006408691


In [30]:
captions = []
image_paths = []

# Load the images
for i in range(6, 11):
    image_path = f"{image_folder}/{i}.jpg"
    image_paths.append(image_path)

In [31]:
print(image_paths)

['Dataset/SSID_Images//6.jpg', 'Dataset/SSID_Images//7.jpg', 'Dataset/SSID_Images//8.jpg', 'Dataset/SSID_Images//9.jpg', 'Dataset/SSID_Images//10.jpg']


In [32]:
def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    return inputs

In [33]:
processed_images = [preprocess_image(image_path) for image_path in image_paths]

In [34]:
captions = []

for inputs in processed_images:
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    captions.append(caption)

In [35]:
for i, caption in enumerate(captions):
    print(f"Caption for image {i+1}: {caption}")

Caption for image 1: a man in a blue jacket and helmet walks through a narrow ravine
Caption for image 2: a horse in a field with the words,'the best horse is a horse '
Caption for image 3: a man and woman walking down a dirt road
Caption for image 4: the cover of the book, the book of the year, with a photo of a mountain range
Caption for image 5: a man in a blue shirt and hat walks down the street
