In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; force_remount=True requests a remount
# even when a mount is already present.
drive.mount(mountpoint='/content/drive', force_remount=True)
print("Drive re-mounted successfully.")

Mounted at /content/drive
✅ Drive re-mounted successfully.


In [1]:

!pip install -q transformers datasets accelerate pillow nltk

import os, json, types, inspect, random
from PIL import Image, UnidentifiedImageError
from torch.utils.data import Dataset, random_split
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    Trainer,
    TrainingArguments,
)
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Quietly fetch the NLTK tokenizer resources; nltk.word_tokenize is used
# further down when BLEU scores are computed.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)
print(" Libraries ready.")


# Paths
# Image paths in the annotation file are resolved relative to base_image_dir.
base_image_dir = "/content/drive/MyDrive/blip_model"
json_path = "/content/drive/MyDrive/blip_model/final_train_description.json"

# Checkpoints, the final model and the evaluation report all go to output_dir.
output_dir = "/content/drive/MyDrive/blip_model/blip_scene_finetuned_fulldata_2"
os.makedirs(output_dir, exist_ok=True)

test_results_path = os.path.join(output_dir, "test_results.json")
print("Paths configured.")

# Pretrained BLIP captioning checkpoint used as the starting point for fine-tuning.
model_name = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_name)
processor = BlipProcessor.from_pretrained(model_name)
print("BLIP model loaded.")

# Dataset Class
class SceneUnderstandingDataset(Dataset):
    """Image/caption pairs encoded as BLIP training samples.

    Every entry of ``items`` is expected to be a dict with an ``"image"`` key
    (a path relative to ``base_dir``) and, optionally, a ``"caption"`` key.
    Entries that cannot be used are counted and dropped at construction time:
    entries without a usable ``"image"`` string, entries whose file does not
    exist, and entries whose file fails ``PIL.Image.verify()``.

    Args:
        items: Iterable of annotation entries.
        base_dir: Directory that relative image paths are joined onto.
        processor: Callable invoked with ``images=`` and ``text=``; it must
            also expose ``tokenizer.pad_token_id``.
        max_len: Length captions are padded / truncated to.
    """

    def __init__(self, items, base_dir, processor, max_len=64):
        self.base_dir = base_dir
        self.processor = processor
        self.max_len = max_len

        valid = []
        skipped = 0
        for item in items:
            # A malformed entry (not a dict, or no usable "image" string) is
            # skipped like any other bad sample instead of aborting the whole
            # dataset build with a KeyError/TypeError.
            rel_path = item.get("image") if isinstance(item, dict) else None
            if not isinstance(rel_path, str) or not rel_path:
                skipped += 1
                continue

            img_path = os.path.join(base_dir, rel_path)
            if not os.path.exists(img_path):
                skipped += 1
                continue
            try:
                # verify() checks file integrity without keeping decoded
                # pixel data; the handle is closed by the context manager.
                with Image.open(img_path) as im:
                    im.verify()
            except Exception:
                # Any failure to open/verify means the sample is unusable.
                skipped += 1
                continue
            valid.append(item)
        self.data = valid
        print(f"Dataset ready: {len(self.data)} valid, skipped {skipped}")

    def __len__(self):
        """Return the number of samples that passed validation."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample ``idx`` for BLIP.

        Returns:
            A dict with tensors ``pixel_values``, ``input_ids``,
            ``attention_mask`` and ``labels`` (batch dimension squeezed
            away), plus the plain-string fields ``caption`` and
            ``image_file``.

        Raises:
            IndexError: If ``idx`` is outside the validated sample list.
        """
        item = self.data[idx]
        img_path = os.path.join(self.base_dir, item["image"])
        try:
            # Decode inside a context manager so the underlying file handle
            # is released right away rather than whenever the image object is
            # garbage collected; convert() returns an independent image.
            with Image.open(img_path) as im:
                image = im.convert("RGB")
        except Exception:
            # Best-effort fallback kept from the original design: a file that
            # became unreadable after validation yields a black placeholder
            # with a sentinel caption rather than aborting the run.
            image = Image.new("RGB", (224,224), (0,0,0))
            caption = "unreadable image"
        else:
            caption = item.get("caption", "")

        enc = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
        )

        input_ids = enc["input_ids"].squeeze(0)
        # Labels are the caption tokens themselves; padding positions are set
        # to -100 so that they are excluded from the loss.
        labels = input_ids.clone()
        pad_id = self.processor.tokenizer.pad_token_id or 0
        labels[labels == pad_id] = -100

        return {
            "pixel_values": enc["pixel_values"].squeeze(0),
            "input_ids": input_ids,
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": labels,
            "caption": caption,
            "image_file": item["image"],
        }

# Load JSON & Split
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The split logic below slices and shuffles a list; fail early with a clear
# message if the annotation file has a different top-level structure.
if not isinstance(data, list):
    raise TypeError(
        f"Expected a JSON list of annotation entries in {json_path}, got {type(data).__name__}"
    )

# Shuffle with a dedicated, fixed-seed RNG so the 90/10 train/test split is
# identical on every run. Without a seed, each run evaluates on a different
# subset and the saved BLEU results cannot be reproduced or compared.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)
split = int(0.9 * len(data))
train_items, test_items = data[:split], data[split:]

train_dataset = SceneUnderstandingDataset(train_items, base_image_dir, processor)
test_dataset = SceneUnderstandingDataset(test_items, base_image_dir, processor)

# Training Arguments
# Inspect the installed TrainingArguments so only options it accepts are passed.
ta_params = inspect.signature(TrainingArguments.__init__).parameters
supported = set(ta_params.keys()) - {"self", "kwargs"}

# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick the name
# this installation accepts (preferring the new one) so evaluation is not
# silently dropped by the version filter below.
eval_strategy_key = "eval_strategy" if "eval_strategy" in supported else "evaluation_strategy"

desired_args = {
    "output_dir": output_dir,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 6,
    "learning_rate": 5e-6,
    "weight_decay": 0.02,
    "warmup_ratio": 0.05,
    "save_strategy": "epoch",
    eval_strategy_key: "epoch",
    "save_total_limit": 2,
    "logging_steps": 10,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    # The dataset returns extra string fields (caption, image_file) that are
    # read again during BLEU evaluation, so columns are not pruned.
    "remove_unused_columns": False,
    "fp16": torch.cuda.is_available(),
    "report_to": "none",
}

# Filter keys by version
filtered_args = {k: v for k, v in desired_args.items() if k in supported}

# Safety Fix: best-model selection is keyed on eval_loss, so it is only kept
# when an evaluation strategy option actually survived the filter.
if eval_strategy_key not in filtered_args:
    filtered_args.pop("load_best_model_at_end", None)
    filtered_args.pop("metric_for_best_model", None)
    print("No evaluation strategy in this transformers version — disabled best model loading.")

training_args = TrainingArguments(**filtered_args)
print("TrainingArguments created successfully.")

# Patch BLIP forward
def patched_forward(self, *args, **kwargs):
    """Call the saved ``old_forward`` without the ``num_items_in_batch`` keyword.

    All positional arguments and every other keyword argument are passed
    through unchanged, and the result of ``self.old_forward`` is returned.
    """
    forwarded = {
        key: value for key, value in kwargs.items() if key != "num_items_in_batch"
    }
    return self.old_forward(*args, **forwarded)

# Remember the original forward only once. If this code is re-run on a model
# that is already patched, `model.forward` is the wrapper itself; saving it as
# `old_forward` would make the wrapper call itself and recurse without end.
if not hasattr(model, "old_forward"):
    model.old_forward = model.forward
# (Re)bind the wrapper so the current definition of patched_forward is used.
model.forward = types.MethodType(patched_forward, model)
print("Patched model.forward().")

# Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    # Hand over the held-out split only when it actually contains samples.
    eval_dataset=None if len(test_dataset) == 0 else test_dataset,
)

# Train
print("Training...")
trainer.train()
print("Training done.")

# Save
# Model and processor are written to the same directory; both are loaded
# back from output_dir later on.
for artifact in (model, processor):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# Evaluation (BLEU)
# NOTE(review): test_dataset is also the split given to the Trainer as
# eval_dataset, so it is not an untouched hold-out set — confirm this is intended.
if len(test_dataset) == 0:
    print("Test dataset empty; skipping evaluation.")
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Score the model exactly as it was written to disk.
    model_eval = BlipForConditionalGeneration.from_pretrained(output_dir).to(device)
    model_eval.eval()

    def bleu(ref, hyp):
        """Return smoothed sentence-level BLEU-4 of ``hyp`` against the single reference ``ref``.

        Both strings are lower-cased and tokenized with ``nltk.word_tokenize``;
        the four n-gram orders are weighted equally.
        """
        references = [nltk.word_tokenize(ref.lower())]
        hypothesis = nltk.word_tokenize(hyp.lower())
        return sentence_bleu(
            references,
            hypothesis,
            weights=(0.25,0.25,0.25,0.25),
            smoothing_function=SmoothingFunction().method1,
        )

    results = []
    total_bleu = 0
    count = 0

    for sample in test_dataset:
        try:
            reference_caption = sample["caption"]
            pixels = sample["pixel_values"].unsqueeze(0).to(device)
            with torch.no_grad():
                generated_ids = model_eval.generate(pixel_values=pixels, max_length=64, num_beams=3)
            generated_caption = processor.decode(generated_ids[0], skip_special_tokens=True)
            sample_score = bleu(reference_caption, generated_caption)
            total_bleu += sample_score
            count += 1
            results.append({
                "image_file": sample["image_file"],
                "ground_truth": reference_caption,
                "generated": generated_caption,
                "bleu": sample_score
            })
        except Exception as e:
            # Best effort: report the failing sample and move on to the next.
            print("Eval error:", e)
            continue

    if count == 0:
        print("No valid test samples.")
    else:
        avg_bleu = total_bleu / count
        print(f"\nAverage BLEU-4 ({count} samples): {avg_bleu:.4f}")
        # Per-sample results are written to test_results_path.
        with open(test_results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {test_results_path}")

print("All done!")

In [None]:
import os
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from google.colab import drive

# Mount Drive first: the fine-tuned model directory and the test image are
# both read from paths under /content/drive.
drive.mount(mountpoint='/content/drive', force_remount=True)
print("Google Drive mounted.")

base_image_dir = "/content/drive/MyDrive/blip_model"
output_dir = "/content/drive/MyDrive/blip_model/blip_scene_finetuned_fulldata_2"

# Image to caption, given relative to base_image_dir.
test_image_file = "test_image_5.jpg"
test_image_path = os.path.join(base_image_dir, test_image_file)

# Caption a single image with the fine-tuned model stored in output_dir.
if not os.path.exists(test_image_path):
    print(f"Error: Test image not found at {test_image_path}.")
    print("Please update 'test_image_file' to an existing image name in your Google Drive.")
else:
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = BlipProcessor.from_pretrained(output_dir)
        model = BlipForConditionalGeneration.from_pretrained(output_dir).to(device)
        model.eval()
        print(f"Fine-tuned model loaded successfully from {output_dir} to {device}.")

        # Decode inside a context manager so the file handle is closed as soon
        # as the RGB copy exists; convert() returns an independent image.
        with Image.open(test_image_path) as image_file:
            image = image_file.convert("RGB")

        # Image-only preprocessing: no text prompt is supplied.
        inputs = processor(images=image, return_tensors="pt").to(device)

        print("\nGenerating caption...")
        with torch.no_grad():
            generated_ids = model.generate(
                pixel_values=inputs.pixel_values,
                max_length=64,
                num_beams=4
            )

        # Decode the generated IDs to a readable string
        generated_caption = processor.decode(generated_ids.squeeze(0), skip_special_tokens=True)

        # --- Output Result ---
        print("\n--- Generation Result ---")
        print(f"Image File: {test_image_file}")
        print(f"Generated Caption: {generated_caption}")
        print("-------------------------")

    except Exception as e:
        # Any failure in loading, preprocessing or generation ends up here;
        # the original exception text is appended to the hint.
        print(f"An error occurred during testing. Check if all required files (config.json, pytorch_model.bin, preprocessor_config.json) are in {output_dir}. Error: {e}")

Mounted at /content/drive
✅ Google Drive mounted.
✅ Fine-tuned model loaded successfully from /content/drive/MyDrive/blip_model/blip_scene_finetuned_fulldata_2 to cuda.

🚀 Generating caption...

--- Generation Result ---
Image File: test_image_5.jpg
Generated Caption: **a low - angle shot of a person falling from a high - rise building facing towards the ground**
-------------------------
