In [None]:
!pip install transformers datasets torchvision scikit-learn nltk sacrebleu


Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cud

In [None]:
# Install Kaggle CLI
!pip install kaggle

# Move kaggle.json
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download Flickr8k dataset
!kaggle datasets download -d adityajn105/flickr8k
!unzip flickr8k.zip -d dataset


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset/Images/2844846111_8c1cbfc75d.jpg  
  inflating: dataset/Images/2844963839_ff09cdb81f.jpg  
  inflating: dataset/Images/2845246160_d0d1bbd6f0.jpg  
  inflating: dataset/Images/2845691057_d4ab89d889.jpg  
  inflating: dataset/Images/2845845721_d0bc113ff7.jpg  
  inflating: dataset/Images/2846037553_1a1de50709.jpg  
  inflating: dataset/Images/2846785268_904c5fcf9f.jpg  
  inflating: dataset/Images/2846843520_b0e6211478.jpg  
  inflating: dataset/Images/2847514745_9a35493023.jpg  
  inflating: dataset/Images/2847615962_c330bded6e.jpg  
  inflating: dataset/Images/2847859796_4d9cb0d31f.jpg  
  inflating: dataset/Images/2848266893_9693c66275.jpg  
  inflating: dataset/Images/2848571082_26454cb981.jpg  
  inflating: dataset/Images/2848895544_6d06210e9d.jpg  
  inflating: dataset/Images/2848977044_446a31d86e.jpg  
  inflating: dataset/Images/2849194983_2968c72832.jpg  
  inflating: dataset/Images/2850719435_

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset
import torch

from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

class FlickrDataset(Dataset):
    def __init__(self, img_dir, captions_file, processor, max_samples=5000):
        self.img_dir = img_dir
        self.processor = processor
        self.samples = []

        # Read captions file
        with open(captions_file, 'r') as f:
            lines = f.readlines()

        for line in lines:
            img_name, caption = line.strip().split('\t')
            img_name = img_name.split('#')[0]
            caption = caption.strip()
            self.samples.append((img_name, caption))

        # Optional subsample
        if max_samples:
            self.samples = self.samples[:max_samples]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_name, caption = self.samples[idx]
        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert('RGB')
        encoding = self.processor(images=image, text=caption, padding='max_length', return_tensors='pt', max_length=128, truncation=True)
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
import pandas as pd
import os
from torch.utils.data import Dataset
from PIL import Image

class FlickrDataset(Dataset):
    def __init__(self, img_dir, captions_file, processor, max_samples=5000):
        self.img_dir = img_dir
        self.processor = processor

        # ✅ Read CSV file
        df = pd.read_csv(captions_file)

        # ✅ Group captions by image filename (optional: only if you want all captions per image)
        # grouped = df.groupby('image')['caption'].apply(list).reset_index()

        # ✅ For simple setup: take only unique image-caption pairs
        data = df[['image', 'caption']].drop_duplicates()

        # ✅ Limit samples
        if max_samples:
            data = data.sample(n=min(max_samples, len(data)), random_state=42).reset_index(drop=True)

        self.samples = data

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.samples.iloc[idx]['image'])
        caption = self.samples.iloc[idx]['caption']

        image = Image.open(img_path).convert('RGB')

        # Process image + text (caption) together
        inputs = self.processor(images=image, text=caption, return_tensors="pt", padding='max_length', truncation=True)

        # Remove extra batch dimension from processor output
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        return inputs


In [None]:
from torch.utils.data import random_split, DataLoader

IMG_DIR = "/content/dataset/Images"
CAPTIONS_FILE = "/content/dataset/captions.txt"

dataset = FlickrDataset(IMG_DIR, CAPTIONS_FILE, processor, max_samples=5000)

train_size = int(0.9 * len(dataset))
train_ds, val_ds = random_split(dataset, [train_size, len(dataset) - train_size])

train_loader = DataLoader(train_ds, batch_size=4, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=4)


In [None]:
# =======================
# 📦 Install dependencies
# =======================
!pip install transformers datasets timm pillow tqdm

# =======================
# 📚 Import libraries
# =======================
import os
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from tqdm import tqdm
from torch.optim import AdamW

# =======================
# ⚙️ Setup device
# =======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# =======================
# 📥 Load BLIP model (public)
# =======================
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# =======================
# 📂 Paths
# =======================
IMG_DIR = "/content/dataset/Images/"   # <-- Update this path if different
CAPTIONS_FILE = "/content/dataset/captions.txt" # <-- Your captions.txt file

# =======================
# 🖼️ Custom Dataset class (handles missing images)
# =======================
class FlickrDataset(Dataset):
    def __init__(self, img_dir, captions_file, processor, max_samples=500):
        self.img_dir = img_dir
        self.processor = processor
        self.data = []

        valid_images = set(os.listdir(img_dir))

        with open(captions_file, 'r') as f:
            lines = f.readlines()

        skipped = 0
        for line in lines[:max_samples]:
            line = line.strip()
            if not line:
                continue

            # Auto-detect separator
            if ',' in line:
                sep = ','
            elif '\t' in line:
                sep = '\t'
            else:
                continue

            parts = line.split(sep, 1)
            if len(parts) != 2:
                continue

            image, caption = parts
            image = image.strip()
            caption = caption.strip()

            if image in valid_images:
                self.data.append((image, caption))
            else:
                skipped += 1

        print(f"✅ Loaded {len(self.data)} samples. ❌ Skipped {skipped} samples (missing images).")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_name, caption = self.data[idx]
        image_path = os.path.join(self.img_dir, image_name)
        image = Image.open(image_path).convert("RGB")

        encoding = self.processor(text=caption, images=image, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        return encoding

# =======================
# 🔗 Collate function
# =======================
def collate_fn(batch):
    return {k: torch.stack([d[k] for d in batch]) for k in batch[0]}

# =======================
# 📊 Prepare DataLoader
# =======================
dataset = FlickrDataset(IMG_DIR, CAPTIONS_FILE, processor, max_samples=500)

if len(dataset) == 0:
    raise ValueError("❌ No valid samples found. Please check your dataset paths and captions file!")

train_size = int(0.9 * len(dataset))
train_ds, val_ds = random_split(dataset, [train_size, len(dataset) - train_size])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, collate_fn=collate_fn)

# =======================
# 🚀 Fast Training Loop
# =======================
# Freeze vision encoder (optional but speeds up)
for name, param in model.named_parameters():
    if "vision_model" in name:
        param.requires_grad = False

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
EPOCHS = 1

model.train()
for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    total_loss = 0

    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch['input_ids'])
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loop.set_description(f"Epoch [{epoch+1}/{EPOCHS}]")
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

# =======================
# 💾 Save model
# =======================
output_dir = "/content/blip-finetuned"
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)
print(f"✅ Model fine-tuning complete and saved to {output_dir}")


Using device: cuda
✅ Loaded 499 samples. ❌ Skipped 1 samples (missing images).


Epoch [1/1]: 100%|██████████| 15/15 [00:35<00:00,  2.39s/it, loss=5.73]


Epoch 1 - Average Loss: 7.6306
✅ Model fine-tuning complete and saved to /content/blip-finetuned


In [1]:
!unzip /content/blip-finetuned.zip -d /content/blip-finetuned/


Archive:  /content/blip-finetuned.zip
   creating: /content/blip-finetuned/content/blip-finetuned/
  inflating: /content/blip-finetuned/content/blip-finetuned/config.json  
  inflating: /content/blip-finetuned/content/blip-finetuned/generation_config.json  
  inflating: /content/blip-finetuned/content/blip-finetuned/tokenizer_config.json  
  inflating: /content/blip-finetuned/content/blip-finetuned/preprocessor_config.json  
  inflating: /content/blip-finetuned/content/blip-finetuned/model.safetensors  
  inflating: /content/blip-finetuned/content/blip-finetuned/vocab.txt  
  inflating: /content/blip-finetuned/content/blip-finetuned/special_tokens_map.json  
  inflating: /content/blip-finetuned/content/blip-finetuned/tokenizer.json  


In [3]:
# =======================
# 📦 Install dependencies (if not already)
# =======================
!pip install transformers pillow

# =======================
# 📚 Import libraries
# =======================
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# =======================
# ⚙️ Load fine-tuned model
# =======================
model_path = "/content/blip-finetuned/content/blip-finetuned"  # <-- Path where model was saved
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = BlipProcessor.from_pretrained(model_path)
model = BlipForConditionalGeneration.from_pretrained(model_path).to(device)

# =======================
# 🖼️ Load any image (example)
# =======================
image_path = "/content/AE-Crime-Blog-GettyImages-1127614620-1.jpg"  # <-- Change to your image path
image = Image.open(image_path).convert("RGB")

# =======================
# 🔥 Generate caption
# =======================
inputs = processor(images=image, return_tensors="pt").to(device)
out = model.generate(**inputs, max_length=64)
caption = processor.decode(out[0], skip_special_tokens=True)

print("🖼️ Image Path:", image_path)
print("📝 Generated Caption:", caption)


🖼️ Image Path: /content/AE-Crime-Blog-GettyImages-1127614620-1.jpg
📝 Generated Caption: a girl in a hat is using a laptop


In [6]:
!pip install -U google-generativeai




AIzaSyA4mHi9wTVd4ruEEAGT5mwchfbqi6NTSII

In [9]:
import google.generativeai as genai

# Configure with your API key
genai.configure(api_key="AIzaSyA4mHi9wTVd4ruEEAGT5mwchfbqi6NTSII")

# List available models (to be sure what’s available)
models = genai.list_models()
for m in models:
    print(m.name, "  —  ", m.supported_generation_methods)


models/chat-bison-001   —   ['generateMessage', 'countMessageTokens']
models/text-bison-001   —   ['generateText', 'countTextTokens', 'createTunedTextModel']
models/embedding-gecko-001   —   ['embedText', 'countTextTokens']
models/gemini-1.0-pro-vision-latest   —   ['generateContent', 'countTokens']
models/gemini-pro-vision   —   ['generateContent', 'countTokens']
models/gemini-1.5-pro-latest   —   ['generateContent', 'countTokens']
models/gemini-1.5-pro-001   —   ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-pro-002   —   ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-pro   —   ['generateContent', 'countTokens']
models/gemini-1.5-flash-latest   —   ['generateContent', 'countTokens']
models/gemini-1.5-flash-001   —   ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-flash-001-tuning   —   ['generateContent', 'countTokens', 'createTunedModel']
models/gemini-1.5-flash   —   ['generateContent', 'countToke

In [13]:
# 📦 Install dependencies (if not already)
!pip install -U google-generativeai transformers pillow

# 📚 Import libraries
import google.generativeai as genai
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# ⚙️ Configure Gemini API Key (replace YOUR_API_KEY)
genai.configure(api_key="AIzaSyA4mHi9wTVd4ruEEAGT5mwchfbqi6NTSII")

# ✅ Load Gemini 1.5 Pro (the best available)
gemini_model = genai.GenerativeModel('models/gemini-1.5-pro-latest')

# ⚙️ Load BLIP fine-tuned model (Image Captioning)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
blip_model_path = "/content/blip-finetuned/content/blip-finetuned"
blip_processor = BlipProcessor.from_pretrained(blip_model_path)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_path).to(device)

# 🖼️ Load image
image_path = "/content/istockphoto-1481901859-612x612.jpg"
image = Image.open(image_path).convert("RGB")

# 🔥 Generate caption (image description)
inputs = blip_processor(images=image, return_tensors="pt").to(device)
out = blip_model.generate(**inputs, max_length=64)
caption = blip_processor.decode(out[0], skip_special_tokens=True)

# 🕵️‍♂️ Prepare crime investigation prompt (for Gemini)
investigation_prompt = (
    f"You are a professional crime scene investigator. Carefully analyze the scene description "
    f"to determine if any crime took place, identify possible suspects, evidence, and explain "
    f"your reasoning with clear details.\n\n"
    f"Scene Description: {caption}\n\n"
    f"Your detailed investigation report:"
)

# 🧠 Generate reasoning with Gemini
response = gemini_model.generate_content(investigation_prompt)
investigation_solution = response.text

# 🖼️ Output results
print(f"Image Description: {caption}")
print(f"LLM Decision: {investigation_solution}")


Image Description: a person in a red rain suit crouchs on the ground to the ground in front of a police car
LLM Decision: ## Crime Scene Investigation Report

**Date:** October 26, 2023
**Time:**  (Time of observation not provided - crucial detail needed)
**Location:** (Location not provided - crucial detail needed)
**Reporting Officer:** (Name of reporting officer/observer not provided - crucial detail needed)


**Scene Description:** A person in a red rain suit is crouching on the ground in front of a police car.


**Preliminary Assessment:**

The provided scene description is incredibly vague and insufficient to determine if a crime has taken place.  The observation of a person crouching in front of a police car, while potentially unusual, is not inherently criminal.  Several scenarios, both innocent and criminal, are possible.


**Possible Scenarios (Innocent):**

* **Seeking Assistance:** The individual could be asking for directions, reporting a crime or incident, or seeking help

In [None]:
!zip -r blip-finetuned.zip /content/blip-finetuned
from google.colab import files
files.download('blip-finetuned.zip')


  adding: content/blip-finetuned/ (stored 0%)
  adding: content/blip-finetuned/config.json (deflated 67%)
  adding: content/blip-finetuned/generation_config.json (deflated 29%)
  adding: content/blip-finetuned/tokenizer_config.json (deflated 74%)
  adding: content/blip-finetuned/preprocessor_config.json (deflated 48%)
  adding: content/blip-finetuned/model.safetensors (deflated 7%)
  adding: content/blip-finetuned/vocab.txt (deflated 53%)
  adding: content/blip-finetuned/special_tokens_map.json (deflated 80%)
  adding: content/blip-finetuned/tokenizer.json (deflated 71%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Install HF Hub
!pip install -U huggingface_hub




In [15]:
# Login (you'll paste token in prompt)
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
!pip install -U "huggingface_hub[cli]"
!huggingface-cli login


Collecting InquirerPy==0.3.4 (from huggingface_hub[cli])
  Downloading InquirerPy-0.3.4-py3-none-any.whl.metadata (8.1 kB)
Collecting pfzy<0.4.0,>=0.3.1 (from InquirerPy==0.3.4->huggingface_hub[cli])
  Downloading pfzy-0.3.4-py3-none-any.whl.metadata (4.9 kB)
Downloading InquirerPy-0.3.4-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pfzy-0.3.4-py3-none-any.whl (8.5 kB)
Installing collected packages: pfzy, InquirerPy
Successfully installed InquirerPy-0.3.4 pfzy-0.3.4

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    

In [17]:
from huggingface_hub import HfApi

repo_id = "sufyanbinimran/blip-finetuned"  # your repo id

api = HfApi()
api.create_repo(repo_id, repo_type="model", exist_ok=True)


RepoUrl('https://huggingface.co/sufyanbinimran/blip-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='sufyanbinimran/blip-finetuned')

In [18]:
from huggingface_hub import upload_folder

repo_id = "sufyanbinimran/blip-finetuned"
folder_path = "/content/blip-finetuned/content/blip-finetuned"  # your fine-tuned model folder

upload_folder(
    folder_path=folder_path,
    repo_id=repo_id,
    repo_type="model"
)


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sufyanbinimran/blip-finetuned/commit/85ef4f8a8816b6bd4a773af421853ca5b1bcacaf', commit_message='Upload folder using huggingface_hub', commit_description='', oid='85ef4f8a8816b6bd4a773af421853ca5b1bcacaf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sufyanbinimran/blip-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='sufyanbinimran/blip-finetuned'), pr_revision=None, pr_num=None)

In [19]:
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("sufyanbinimran/blip-finetuned")
model = BlipForConditionalGeneration.from_pretrained("sufyanbinimran/blip-finetuned")


preprocessor_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]