In [1]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset
import os
import json
from torch.utils.data import DataLoader
from transformers import AdamW, CLIPProcessor, CLIPModel, GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


In [3]:
annotations_folder = 'Dataset/Organized_Annotations/'
image_folder = 'Dataset/SSID_Images/'

In [4]:
image_paths = []
for i in range(1, 11):
    image_path = f"{image_folder}/{i}.jpg"
    image_paths.append(image_path)

In [5]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")



In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

In [7]:
image_captions = {image_path: generate_caption(image_path) for image_path in image_paths}



In [8]:
for img, caption in image_captions.items():
    print(f"{img}: {caption}")

Dataset/SSID_Images//1.jpg: a group of people walking up a snowy slope
Dataset/SSID_Images//2.jpg: a person on a snowboard on a mountain
Dataset/SSID_Images//3.jpg: a man climbing up a snowy mountain
Dataset/SSID_Images//4.jpg: a man standing on top of a mountain
Dataset/SSID_Images//5.jpg: a man sitting on top of a snowy mountain
Dataset/SSID_Images//6.jpg: a man climbing up a mountain with a helmet on
Dataset/SSID_Images//7.jpg: a field with a fence and mountains in the background
Dataset/SSID_Images//8.jpg: a man with a backpack on a trail
Dataset/SSID_Images//9.jpg: the summit of the mountain is covered in snow
Dataset/SSID_Images//10.jpg: a man wearing a blue shirt


# Organising Image Label Data

In [9]:
with open('Dataset/Organized_Annotations/SSID_Train_Organized.json') as f:
    organized_data = json.load(f)

In [10]:
fine_tune_data = {}

album_limit = 100
processed_album_count = 0

In [11]:
for album_id in organized_data.items():
    if processed_album_count >= album_limit:
        break 

    album_id = album_id[1]

    # get the value of the first key in the dictionary
    stories = album_id[list(album_id.keys())[0]]

    for item in stories:

        # add a key value pair to the dictionary, key being the image_id and value being the storytext
        fine_tune_data[item['image_id']] = item['storytext']

In [12]:
print(len(fine_tune_data))

15625


In [13]:
sorted_fine_tune_data = dict(sorted(fine_tune_data.items(), key=lambda x: int(x[0])))

In [14]:
print(len(sorted_fine_tune_data))

15625


# Fine Tune BLIP

In [15]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class ImageCaptionDataset(Dataset):
    def __init__(self, data_dict, image_folder, processor):
        """
        Args:
            data_dict (dict): A dictionary where keys are image names, and values are captions.
            image_folder (str): Path to the folder containing the images.
            processor (BlipProcessor): The processor to preprocess the images and captions.
        """
        self.data_dict = data_dict
        self.image_folder = image_folder
        self.processor = processor
        self.image_keys = list(data_dict.keys())
        
    def __len__(self):
        return len(self.image_keys)
    
    def __getitem__(self, idx):
        image_key = self.image_keys[idx]
        caption = self.data_dict[image_key]
        
        image_path = f"{self.image_folder}/{image_key}.jpg"  
        image = Image.open(image_path).convert('RGB')
        
        inputs = self.processor(images=image, text=caption, return_tensors="pt", padding="max_length", truncation=True)
        
        return {
            "input_ids": inputs['input_ids'].squeeze(),  
            "attention_mask": inputs['attention_mask'].squeeze(),  
            "pixel_values": inputs['pixel_values'].squeeze()  
        }

In [16]:
sorted_fine_tune_data = dict(list(sorted_fine_tune_data.items())[:8])

In [17]:
print(image_captions)

{'Dataset/SSID_Images//1.jpg': 'a group of people walking up a snowy slope', 'Dataset/SSID_Images//2.jpg': 'a person on a snowboard on a mountain', 'Dataset/SSID_Images//3.jpg': 'a man climbing up a snowy mountain', 'Dataset/SSID_Images//4.jpg': 'a man standing on top of a mountain', 'Dataset/SSID_Images//5.jpg': 'a man sitting on top of a snowy mountain', 'Dataset/SSID_Images//6.jpg': 'a man climbing up a mountain with a helmet on', 'Dataset/SSID_Images//7.jpg': 'a field with a fence and mountains in the background', 'Dataset/SSID_Images//8.jpg': 'a man with a backpack on a trail', 'Dataset/SSID_Images//9.jpg': 'the summit of the mountain is covered in snow', 'Dataset/SSID_Images//10.jpg': 'a man wearing a blue shirt'}


In [18]:
image_captions = {k.split('/')[-1].split('.')[0]: v for k, v in image_captions.items()}

print(image_captions)

{'1': 'a group of people walking up a snowy slope', '2': 'a person on a snowboard on a mountain', '3': 'a man climbing up a snowy mountain', '4': 'a man standing on top of a mountain', '5': 'a man sitting on top of a snowy mountain', '6': 'a man climbing up a mountain with a helmet on', '7': 'a field with a fence and mountains in the background', '8': 'a man with a backpack on a trail', '9': 'the summit of the mountain is covered in snow', '10': 'a man wearing a blue shirt'}


In [None]:
epochs = 3
learning_rate = 5e-5
batch_size = 5

dataset = ImageCaptionDataset(image_captions, image_folder, processor)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

optimizer = AdamW(model.parameters(), lr=learning_rate)

model.train()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    for batch in data_loader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item()}")



Epoch 1/3


: 

In [None]:
captions = []
image_paths = []

for i in range(11, 111):
    image_path = f"{image_folder}/{i}.jpg"
    image_paths.append(image_path)