In [1]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset
import os
import json
from torch.utils.data import DataLoader
from transformers import AdamW, CLIPProcessor, CLIPModel, GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


In [3]:
annotations_folder = 'Dataset/Organized_Annotations/'
image_folder = 'Dataset/SSID_Images/'

In [4]:
image_paths = []
for i in range(1, 1001):
    image_path = f"{image_folder}/{i}.jpg"
    image_paths.append(image_path)

In [5]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")



In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

In [8]:
image_captions = {image_path: generate_caption(image_path) for image_path in image_paths}



In [9]:
for img, caption in image_captions.items():
    print(f"{img}: {caption}")

Dataset/SSID_Images//1.jpg: a group of people walking up a snowy slope
Dataset/SSID_Images//2.jpg: a person on a snowboard on a mountain
Dataset/SSID_Images//3.jpg: a man climbing up a snowy mountain
Dataset/SSID_Images//4.jpg: a man standing on top of a mountain
Dataset/SSID_Images//5.jpg: a man sitting on top of a snowy mountain
Dataset/SSID_Images//6.jpg: a man climbing up a mountain with a helmet on
Dataset/SSID_Images//7.jpg: a field with a fence and mountains in the background
Dataset/SSID_Images//8.jpg: a man with a backpack on a trail
Dataset/SSID_Images//9.jpg: the summit of the mountain is covered in snow
Dataset/SSID_Images//10.jpg: a man wearing a blue shirt
Dataset/SSID_Images//11.jpg: a woman is holding a surfboard in her hand
Dataset/SSID_Images//12.jpg: a group of people walking on a trail
Dataset/SSID_Images//13.jpg: a man standing on top of a mountain
Dataset/SSID_Images//14.jpg: a man standing on top of a mountain
Dataset/SSID_Images//15.jpg: a man is skiing down a s

In [10]:
import csv
csv_file = 'image_captions.csv'

# Write the dictionary to a CSV file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['img', 'caption'])
    # Write each item in the dictionary
    for img, caption in image_captions.items():
        writer.writerow([img, caption])

print(f"Data successfully saved to {csv_file}")


Data successfully saved to image_captions.csv
