# Importing Libraries

In [3]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset
import os
import json
from torch.utils.data import DataLoader
from transformers import AdamW, CLIPProcessor, CLIPModel, GPT2LMHeadModel, GPT2Tokenizer

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Load CLIP and GPT-2 models
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") 
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model.to(device)
gpt2_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
def generate_caption_with_clip_gpt2(image_path):
    # Load and process image
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)

    # Get CLIP image embeddings
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)

    # Normalize image features and prepare input for GPT-2
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    image_features_text = image_features.detach().cpu().numpy().astype(str)

    # Use GPT-2 to generate a caption from the image features
    input_ids = gpt2_tokenizer.encode("Image caption: ", return_tensors="pt").to(device)
    outputs = gpt2_model.generate(input_ids, max_length=50, do_sample=True)

    caption = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return caption

In [8]:
annotations_folder = 'Dataset/Organized_Annotations/'
image_folder = 'Dataset/SSID_Images/'

In [20]:
image_paths = []
for i in range(1, 11):
    image_path = f"{image_folder}/{i}.jpg"
    image_paths.append(image_path)

In [21]:
captions = {image_path: generate_caption_with_clip_gpt2(image_path) for image_path in image_paths}

for img, caption in captions.items():
    print(f"{img}: {caption}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Dataset/SSID_Images//1.jpg: Image caption:  Rationalist - he has no understanding of the concept of money

In an article in the Mirror, Mr Smith writes: "I'm really confused because I thought this would help people understand the issue, if there was
Dataset/SSID_Images//2.jpg: Image caption:  A new research study suggests that the way our bodies work may need more research

The study is published in the European Journal of Nursing Science paper, the third published in two weeks.

Dr Susan Eger, an
Dataset/SSID_Images//3.jpg: Image caption:  Anastasia de Valta: "If you take a look at the map we have, you will start to see where the island should lie and where we should go".

In March 2011, in an online
Dataset/SSID_Images//4.jpg: Image caption:  It will run at 15mph in 4.5sec


Brief news - The BBC's Jonathan Head has had the first ever view of Anderton's new new electric car. It's like flying into a hurricane
Dataset/SSID_Images//5.jpg: Image caption:  A view from the bottom of the lak

In [25]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

In [27]:
image_captions = {image_path: generate_caption(image_path) for image_path in image_paths}



In [28]:
for img, caption in image_captions.items():
    print(f"{img}: {caption}")

Dataset/SSID_Images//1.jpg: a group of people walking up a snowy slope
Dataset/SSID_Images//2.jpg: a person on a snowboard on a mountain
Dataset/SSID_Images//3.jpg: a man climbing up a snowy mountain
Dataset/SSID_Images//4.jpg: a man standing on top of a mountain
Dataset/SSID_Images//5.jpg: a man sitting on top of a snowy mountain
Dataset/SSID_Images//6.jpg: a man climbing up a mountain with a helmet on
Dataset/SSID_Images//7.jpg: a field with a fence and mountains in the background
Dataset/SSID_Images//8.jpg: a man with a backpack on a trail
Dataset/SSID_Images//9.jpg: the summit of the mountain is covered in snow
Dataset/SSID_Images//10.jpg: a man wearing a blue shirt


# Organising the Image-Label Data

In [29]:
with open('Dataset/Organized_Annotations/SSID_Train_Organized.json') as f:
    organized_data = json.load(f)

In [None]:
fine_tune_data = {}

album_limit = 200
processed_album_count = 0

In [31]:
for album_id in organized_data.items():
    if processed_album_count >= album_limit:
        break 

    album_id = album_id[1]

    # get the value of the first key in the dictionary
    stories = album_id[list(album_id.keys())[0]]

    for item in stories:

        # add a key value pair to the dictionary, key being the image_id and value being the storytext
        fine_tune_data[item['image_id']] = item['storytext']

In [32]:
print(len(fine_tune_data))

15625


In [33]:
#print(fine_tune_data)

In [34]:
sorted_fine_tune_data = dict(sorted(fine_tune_data.items(), key=lambda x: int(x[0])))

#print(sorted_fine_tune_data)

# Fine Tuning BLIP

In [35]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class ImageCaptionDataset(Dataset):
    def __init__(self, data_dict, image_folder, processor):
        """
        Args:
            data_dict (dict): A dictionary where keys are image names, and values are captions.
            image_folder (str): Path to the folder containing the images.
            processor (BlipProcessor): The processor to preprocess the images and captions.
        """
        self.data_dict = data_dict
        self.image_folder = image_folder
        self.processor = processor
        self.image_keys = list(data_dict.keys())
        
    def __len__(self):
        return len(self.image_keys)
    
    def __getitem__(self, idx):
        image_key = self.image_keys[idx]
        caption = self.data_dict[image_key]
        
        image_path = f"{self.image_folder}/{image_key}.jpg"  
        image = Image.open(image_path).convert('RGB')
        
        inputs = self.processor(images=image, text=caption, return_tensors="pt", padding="max_length", truncation=True)
        
        return {
            "input_ids": inputs['input_ids'].squeeze(),  
            "attention_mask": inputs['attention_mask'].squeeze(),  
            "pixel_values": inputs['pixel_values'].squeeze()  
        }

In [None]:
sorted_fine_tune_data = dict(list(sorted_fine_tune_data.items())[:1000])

In [39]:
print(image_captions)

{'Dataset/SSID_Images//1.jpg': 'a group of people walking up a snowy slope', 'Dataset/SSID_Images//2.jpg': 'a person on a snowboard on a mountain', 'Dataset/SSID_Images//3.jpg': 'a man climbing up a snowy mountain', 'Dataset/SSID_Images//4.jpg': 'a man standing on top of a mountain', 'Dataset/SSID_Images//5.jpg': 'a man sitting on top of a snowy mountain', 'Dataset/SSID_Images//6.jpg': 'a man climbing up a mountain with a helmet on', 'Dataset/SSID_Images//7.jpg': 'a field with a fence and mountains in the background', 'Dataset/SSID_Images//8.jpg': 'a man with a backpack on a trail', 'Dataset/SSID_Images//9.jpg': 'the summit of the mountain is covered in snow', 'Dataset/SSID_Images//10.jpg': 'a man wearing a blue shirt'}


In [42]:
image_captions = {k.split('/')[-1].split('.')[0]: v for k, v in image_captions.items()}

print(image_captions)

{'1': 'a group of people walking up a snowy slope', '2': 'a person on a snowboard on a mountain', '3': 'a man climbing up a snowy mountain', '4': 'a man standing on top of a mountain', '5': 'a man sitting on top of a snowy mountain', '6': 'a man climbing up a mountain with a helmet on', '7': 'a field with a fence and mountains in the background', '8': 'a man with a backpack on a trail', '9': 'the summit of the mountain is covered in snow', '10': 'a man wearing a blue shirt'}


In [43]:
epochs = 3
learning_rate = 5e-5
batch_size = 10

dataset = ImageCaptionDataset(image_captions, image_folder, processor)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

optimizer = AdamW(model.parameters(), lr=learning_rate)

model.train()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    for batch in data_loader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item()}")



Epoch 1/3
Loss: 13.018089294433594
Epoch 2/3
Loss: 11.234161376953125
Epoch 3/3
Loss: 9.930630683898926


In [50]:
captions = []
image_paths = []

for i in range(11, 111):
    image_path = f"{image_folder}/{i}.jpg"
    image_paths.append(image_path)

In [51]:
print(image_paths)

['Dataset/SSID_Images//11.jpg', 'Dataset/SSID_Images//12.jpg', 'Dataset/SSID_Images//13.jpg', 'Dataset/SSID_Images//14.jpg', 'Dataset/SSID_Images//15.jpg', 'Dataset/SSID_Images//16.jpg', 'Dataset/SSID_Images//17.jpg', 'Dataset/SSID_Images//18.jpg', 'Dataset/SSID_Images//19.jpg', 'Dataset/SSID_Images//20.jpg', 'Dataset/SSID_Images//21.jpg', 'Dataset/SSID_Images//22.jpg', 'Dataset/SSID_Images//23.jpg', 'Dataset/SSID_Images//24.jpg', 'Dataset/SSID_Images//25.jpg', 'Dataset/SSID_Images//26.jpg', 'Dataset/SSID_Images//27.jpg', 'Dataset/SSID_Images//28.jpg', 'Dataset/SSID_Images//29.jpg', 'Dataset/SSID_Images//30.jpg', 'Dataset/SSID_Images//31.jpg', 'Dataset/SSID_Images//32.jpg', 'Dataset/SSID_Images//33.jpg', 'Dataset/SSID_Images//34.jpg', 'Dataset/SSID_Images//35.jpg', 'Dataset/SSID_Images//36.jpg', 'Dataset/SSID_Images//37.jpg', 'Dataset/SSID_Images//38.jpg', 'Dataset/SSID_Images//39.jpg', 'Dataset/SSID_Images//40.jpg', 'Dataset/SSID_Images//41.jpg', 'Dataset/SSID_Images//42.jpg', 'Datase

In [52]:
def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    return inputs

In [53]:
processed_images = [preprocess_image(image_path) for image_path in image_paths]

In [54]:
captions = []

for inputs in processed_images:
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    captions.append(caption)

In [58]:
print(captions)

['a woman in a green jacket and black pants holding a snowboard', 'a group of people walking down a dirt covered road', 'a skier skiing down a snowy covered mountain', 'a skier skiing down a snowy covered mountain', 'a skier skiing down a snowy covered mountain', 'a skier skiing down a snowy covered mountain', "the cover of the book, the mountaineers's guide to the ski resort", 'a skier skiing down a snowy covered mountain', 'a group of people riding ski boards down a snow covered slope', 'a skier on a ski slope with a ski board in his hand', 'a skier skiing down a snowy covered mountain', 'a group of people riding ski boards down a snow covered slope', 'a group of people riding ski boards down a snow covered slope', "the cover of the album,'the mountain '", 'a skier makes a turn on a snowy slope in the ski area of the ski resort in whistle', 'a man sitting on a bench in the grass', 'a woman walking through a field of flowers', 'a woman sitting at a table with a whiteboard in front of 

In [60]:
for i, caption in enumerate(captions, start=10):
    print(f"Caption for image {i+1}: {caption}")
 

Caption for image 11: a woman in a green jacket and black pants holding a snowboard
Caption for image 12: a group of people walking down a dirt covered road
Caption for image 13: a skier skiing down a snowy covered mountain
Caption for image 14: a skier skiing down a snowy covered mountain
Caption for image 15: a skier skiing down a snowy covered mountain
Caption for image 16: a skier skiing down a snowy covered mountain
Caption for image 17: the cover of the book, the mountaineers's guide to the ski resort
Caption for image 18: a skier skiing down a snowy covered mountain
Caption for image 19: a group of people riding ski boards down a snow covered slope
Caption for image 20: a skier on a ski slope with a ski board in his hand
Caption for image 21: a skier skiing down a snowy covered mountain
Caption for image 22: a group of people riding ski boards down a snow covered slope
Caption for image 23: a group of people riding ski boards down a snow covered slope
Caption for image 24: the c