## BLIP (Pre-trained)

In [1]:
import pandas as pd
import re
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
from tqdm import tqdm
import os

def clean_caption(caption):
    """Normalize a raw figure caption into lowercase alphanumeric text.

    The caption is lowercased, leading figure labels such as ``figure 2``,
    ``figure 2.1`` or ``fig. 3`` are removed, every character other than
    ``a-z``, ``0-9`` and whitespace is dropped, and runs of whitespace are
    collapsed to a single space.

    Args:
        caption: Raw caption text. Non-string values (``None``, or the float
            NaN that pandas yields for an empty CSV cell) are treated as an
            empty caption instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned caption, or ``""`` when there is no text to clean.
    """
    if not isinstance(caption, str):
        return ""

    text = caption.lower()
    # \b keeps words that merely contain the label (e.g. "configure 3") intact;
    # the optional period covers the abbreviated "fig. 3" form.
    text = re.sub(r"\b(?:figure|fig)\.?\s*\d+(?:\.\d+)*", "", text)
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()


# Load the BLIP processor and the captioning model from the same pre-trained
# checkpoint; both names are read as globals by the caption helpers below.
processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Caption table; the columns used in this notebook are 'full_caption' and 'imageid'.
captions_df = pd.read_csv('captions.csv') 


# Keep the raw text and store the normalized caption in a separate column.
captions_df['clean_caption'] = captions_df['full_caption'].apply(clean_caption)


# Directory holding the images, looked up as <imageid>.png.
image_folder = 'ImageList'

def verify_image_exists(image_id):
    """Report whether ``<image_folder>/<image_id>.png`` is an existing file.

    Args:
        image_id: Identifier of the image; it is converted with ``str`` and
            suffixed with ``.png`` to form the file name.

    Returns:
        ``True`` when the file exists inside the global ``image_folder``,
        ``False`` otherwise.
    """
    file_name = str(image_id) + ".png"
    candidate = os.path.join(image_folder, file_name)
    return os.path.isfile(candidate)

# Flag the rows whose image file is present, then keep only those rows.
captions_df['image_exists'] = captions_df['imageid'].apply(verify_image_exists)
captions_df = captions_df[captions_df['image_exists']]

# Report how many caption/image pairs remain after the check.
print(f"Total entries after verification: {len(captions_df)}")


def generate_caption(image_path):
    """Caption one image with the module-level BLIP model using beam search.

    Relies on the global ``processor``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption with special tokens removed, or ``""`` when the
        image cannot be opened or converted (the error is printed, not raised).
    """
    try:
        rgb_image = Image.open(image_path).convert('RGB')
    except Exception as e:
        # Best-effort: report the unreadable file and hand back an empty caption.
        print(f"Error loading image {image_path}: {e}")
        return ""

    # Preprocess the image, then move every tensor to the model's device.
    encoded = processor(images=rgb_image, return_tensors="pt")
    model_inputs = dict((name, tensor.to(device)) for name, tensor in encoded.items())

    # Beam search settings: 5 beams, max_length of 50, early stopping enabled.
    beam_kwargs = dict(max_length=50, num_beams=5, early_stopping=True)
    token_ids = model.generate(**model_inputs, **beam_kwargs)

    # Only the first returned sequence is decoded.
    return processor.decode(token_ids[0], skip_special_tokens=True)

# Smoke-test the pre-trained model on a single image.
single_caption = generate_caption("ImageList/7.png")
print("Caption: ",single_caption)


Total entries after verification: 977
Caption:  a plot with a plot plot and a plot plot plot


In [2]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# NOTE(review): this repeats the checkpoint load and device setup of the first
# cell; `processor`, `model` and `device` are rebound to fresh objects built
# from the same checkpoint name.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_caption_blip(image_path):
    """Caption one image with the current global BLIP ``model`` using sampling.

    Relies on the global ``processor``, ``model`` and ``device``; ``model`` is
    looked up at call time, so rebinding it changes which weights are used.
    Generation samples (``do_sample=True``), so repeated calls on the same
    image may return different captions. Errors opening the image propagate.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption with special tokens removed.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    model_inputs = processor(rgb_image, return_tensors="pt").to(device)

    # Top-k / nucleus sampling with up to 200 newly generated tokens.
    sampling_kwargs = dict(max_new_tokens=200, top_k=50, top_p=0.95, do_sample=True)
    generated_ids = model.generate(**model_inputs, **sampling_kwargs)

    # Only the first returned sequence is decoded.
    return processor.decode(generated_ids[0], skip_special_tokens=True)

# Image reused later in the notebook to caption with the fine-tuned weights.
test_image_path = "ImageList/7.png"
generated_caption = generate_caption_blip(test_image_path)
print(f"Generated Caption (Pretrained BLIP): {generated_caption}")


Generated Caption (Pretrained BLIP): figure 2 for the predicted motion of an orbitale at three axes


model.safetensors:  68%|######7   | 671M/990M [00:00<?, ?B/s]

## BLIP (Fine-tuned)

In [3]:
from torch.utils.data import Dataset
import os
import pandas as pd
from PIL import Image

def clean_caption(caption):
    """Normalize a raw figure caption into lowercase alphanumeric text.

    The caption is lowercased first, so every pattern below is written in
    lowercase. Figure labels (``figure 2``, ``fig3``, ``fig. 3``,
    ``figure 2.1.3``) are removed, every character other than ``a-z``,
    ``0-9`` and whitespace is dropped, and runs of whitespace are collapsed
    to a single space.

    Args:
        caption: Raw caption text. Non-string values (``None``, or the float
            NaN that pandas yields for an empty CSV cell) are treated as an
            empty caption instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned caption, or ``""`` when there is no text to clean.
    """
    if not isinstance(caption, str):
        return ""

    text = caption.lower()
    # The optional period covers "fig. 3"; the repeated (?:\.\d+)* group removes
    # multi-level numbers such as "2.1.3" in full instead of leaving a stray
    # digit behind once punctuation is stripped.
    text = re.sub(r"\b(?:figure|fig)\.?\s*\d+(?:\.\d+)*", "", text)
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

class BlipDataset(Dataset):
    """Image/caption pairs prepared for fine-tuning a BLIP captioning model.

    Rows come from a CSV with ``imageid`` and ``full_caption`` columns; the
    image belonging to a row is ``<img_folder>/<imageid>.png``.
    """

    def __init__(self, csv_file, img_folder, processor):
        """Load the caption table and keep only the rows that can be trained on.

        Rows with a missing caption, or whose image file does not exist, are
        dropped up front so that ``__getitem__`` cannot fail part-way through
        an epoch.

        Args:
            csv_file: Path to the CSV holding ``imageid`` and ``full_caption``.
            img_folder: Directory containing the ``<imageid>.png`` files.
            processor: Object called as ``processor(image, text=..., ...)`` that
                returns a mapping of tensors and exposes
                ``tokenizer.pad_token_id``.
        """
        self.img_folder = img_folder
        self.processor = processor

        # A row without a caption has no training target.
        table = pd.read_csv(csv_file).dropna(subset=['full_caption'])
        table = table.assign(full_caption=table['full_caption'].apply(clean_caption))

        has_image = table['imageid'].map(
            lambda img_id: os.path.isfile(self._image_path(img_id))
        ).astype(bool)
        # Re-number the rows so positional access matches __len__.
        self.data = table[has_image].reset_index(drop=True)

    def _image_path(self, img_id):
        """Return the PNG path for ``img_id`` inside ``self.img_folder``."""
        return os.path.join(self.img_folder, f"{img_id}.png")

    def __len__(self):
        """Return the number of usable image/caption rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the model inputs for the row at position ``idx``.

        Returns:
            A dict with the processor's tensors (the training cell consumes
            ``pixel_values``, ``input_ids`` and ``attention_mask``) plus
            ``labels``, a copy of ``input_ids`` with padding set to -100.
        """
        row = self.data.iloc[idx]

        # The context manager releases the file handle as soon as the pixels
        # have been converted.
        with Image.open(self._image_path(row['imageid'])) as img:
            image = img.convert('RGB')

        encoded = self.processor(
            image,
            text=row['full_caption'],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )
        # Remove only the leading batch axis the processor adds for a single
        # example; a bare squeeze() would also collapse any other size-1 axis.
        inputs = {key: val.squeeze(0) for key, val in encoded.items()}

        # -100 is PyTorch's default ignore_index, so padded positions are
        # excluded from the cross-entropy loss.
        labels = inputs['input_ids'].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        inputs['labels'] = labels
        return inputs


In [4]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Fine-tuning configuration for the BLIP captioning model.
training_args = Seq2SeqTrainingArguments(
    output_dir="./blip_finetuned",
    # No evaluation-strategy argument is passed: no eval dataset is used and
    # "no" is already the default. The keyword was renamed from
    # `evaluation_strategy` to `eval_strategy` in newer transformers releases,
    # so omitting it keeps this cell working with either spelling.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    learning_rate=1e-5,
    save_steps=500,
    save_total_limit=2,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the CPU fallback used elsewhere in this notebook still runs.
    fp16=torch.cuda.is_available(),
    # Gradients are accumulated over 4 steps of batch size 1 before each update.
    gradient_accumulation_steps=4,
    logging_dir="./logs"
)




In [5]:
from torch.utils.data import DataLoader
import re
# Build the training set from the caption CSV and the image folder.
dataset = BlipDataset("captions.csv", "ImageList", processor)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # Batch the per-example dicts by stacking each tensor field along a new
    # leading dimension.
    data_collator=lambda data: {
        field: torch.stack([example[field] for example in data])
        for field in ('pixel_values', 'input_ids', 'attention_mask', 'labels')
    },
)

trainer.train()

# Write the fine-tuned weights to the directory the later cell reloads from.
model.save_pretrained("./blip_finetuned")


Step,Training Loss


In [6]:
# Reload the weights saved by the training cell and move them to the device.
# Rebinding the global `model` is what switches generate_caption_blip to the
# fine-tuned weights, because that helper reads `model` at call time.
model = BlipForConditionalGeneration.from_pretrained("./blip_finetuned")
model.to(device)

# Caption the same test image that was used with the pre-trained weights.
# NOTE(review): generate_caption_blip samples (do_sample=True), so the two
# captions are not a deterministic comparison.
generated_caption = generate_caption_blip(test_image_path)
print(f"Generated Caption (Fine-Tuned BLIP): {generated_caption}")

Generated Caption (Fine-Tuned BLIP): the change in atmospheric co2 emissions of various air and surface systems from 1850 to 1993 and from 1999 and the same air temperature range based on a comparison table of two plots from different air and three panels on a two models of the same line from the four models in the same range are the same scenarios for the changes that have been for each scenario is an uncertain to different scenario
