<a href="https://colab.research.google.com/github/shubham-gupta19/SmartPet/blob/main/SmartPet_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import time
import os

class BLIP2Model:
    """Wrapper around a BLIP-2 checkpoint for image captioning and visual Q&A.

    The constructor loads a ``Blip2Processor`` and a
    ``Blip2ForConditionalGeneration`` for the requested checkpoint. The public
    methods open an image from disk, run beam-search generation on it and
    return the decoded, whitespace-stripped text.

    Attributes set by ``__init__``:
        device: Device that processor outputs are moved to before generation.
        processor: The ``Blip2Processor`` loaded from ``model_name``.
        model: The ``Blip2ForConditionalGeneration`` loaded from ``model_name``.
    """

    def __init__(self, model_name="Salesforce/blip2-opt-2.7b", device=None, low_memory=False):
        """Load the processor and model for ``model_name``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            device: Target device. When ``None``, "cuda" is chosen if
                ``torch.cuda.is_available()`` and "cpu" otherwise.
            low_memory: When true AND the resolved device is exactly the
                string "cuda", load the weights in float16 with
                ``device_map="auto"``. In every other case the model is
                loaded with default settings and moved to ``device``.
        """
        # Set device
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Using device: {self.device}")

        # Load processor and model; perf_counter is monotonic, so the reported
        # duration cannot be skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        print(f"Loading BLIP-2 model: {model_name}...")

        self.processor = Blip2Processor.from_pretrained(model_name)

        # Memory optimization settings - without 8-bit quantization
        if low_memory and self.device == "cuda":
            print("Using low memory settings (half precision)...")
            self.model = Blip2ForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=torch.float16,  # Use half precision
                device_map="auto",  # Automatically distribute model across GPUs or CPU
            )
        else:
            self.model = Blip2ForConditionalGeneration.from_pretrained(model_name)
            self.model.to(self.device)

        elapsed_time = time.perf_counter() - start_time
        print(f"Model loaded in {elapsed_time:.2f} seconds")

    def process_image(self, image_path):
        """Open ``image_path`` and return it as an RGB image.

        The file is opened in a ``with`` block so the underlying file handle
        is released as soon as the pixel data has been converted; the value
        returned is the converted image, not the file-backed one.

        Args:
            image_path: Path of the image file to load.

        Returns:
            The image converted to RGB mode.

        Raises:
            Exception: Any error raised while opening or converting the image
                is reported with ``print`` and then re-raised unchanged.
        """
        try:
            # Load image; convert() hands back a separate image object, so it
            # remains usable after the source file is closed.
            with Image.open(image_path) as source:
                image = source.convert('RGB')
            print(f"Image loaded: {image_path} (Size: {image.size})")
            return image
        except Exception as e:
            print(f"Error loading image: {e}")
            raise

    def _generate(self, image_path, prompt, max_length, num_beams, start_message, done_label):
        """Shared image -> text pipeline behind the public generation methods.

        Loads the image, encodes it together with ``prompt``, runs beam-search
        generation without gradient tracking, and decodes the first sequence.
        ``start_message`` is printed before generation; ``done_label`` prefixes
        the timing line printed afterwards.
        """
        image = self.process_image(image_path)

        # Prepare inputs for the model
        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)

        start_time = time.perf_counter()
        print(start_message)

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

        # Decode the generated IDs to text; only the first sequence is used
        # because a single image is submitted per call.
        decoded = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        generated_text = decoded[0].strip()

        elapsed_time = time.perf_counter() - start_time
        print(f"{done_label} generated in {elapsed_time:.2f} seconds")

        return generated_text

    def generate_caption(self, image_path, max_length=50, num_beams=5):
        """Generate a caption for the image at ``image_path``.

        Args:
            image_path: Path of the image file to caption.
            max_length: ``max_length`` forwarded to ``model.generate``.
            num_beams: Beam width forwarded to ``model.generate``.

        Returns:
            The decoded caption with surrounding whitespace stripped.

        Raises:
            Exception: Propagated from ``process_image`` if the image cannot
                be loaded.
        """
        # An empty text prompt is passed, exactly as the captioning path has
        # always done.
        return self._generate(
            image_path,
            "",
            max_length,
            num_beams,
            "Generating caption...",
            "Caption",
        )

    def answer_question(self, image_path, question, max_length=50, num_beams=5):
        """Answer ``question`` about the image at ``image_path``.

        Args:
            image_path: Path of the image file to inspect.
            question: Text prompt passed to the processor alongside the image.
            max_length: ``max_length`` forwarded to ``model.generate``.
            num_beams: Beam width forwarded to ``model.generate``.

        Returns:
            The decoded model output with surrounding whitespace stripped.

        Raises:
            Exception: Propagated from ``process_image`` if the image cannot
                be loaded.
        """
        return self._generate(
            image_path,
            question,
            max_length,
            num_beams,
            f"Answering question: {question}",
            "Answer",
        )

In [None]:
# Install the notebook's dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel, so the packages are importable afterwards.
# On a fresh runtime, run this cell before the cell that imports torch/transformers.
%pip install -q torch torchvision torchaudio transformers pillow accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


In [None]:
# Upgrade bitsandbytes inside the running kernel's environment (%pip, not !pip).
# NOTE(review): BLIP2Model in this notebook loads in half precision with the 8-bit
# quantization option removed, so this package may no longer be needed - confirm
# before dropping the cell.
%pip install -q -U bitsandbytes



In [None]:
# Authenticate with Hugging Face
# The token is requested at run time instead of being written into the notebook,
# so no credential is stored in the source.
# NOTE(review): confirm whether the checkpoint used below actually requires
# authentication; if it is public this cell may be optional.
from huggingface_hub import login
login()  # This will prompt for your Hugging Face token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Build the shared BLIP-2 wrapper once; low_memory=True requests the
# half-precision load path when the wrapper resolves its device to "cuda".
blip2_model = BLIP2Model(
    model_name="Salesforce/blip2-opt-2.7b",
    low_memory=True,
)

Using device: cuda
Loading BLIP-2 model: Salesforce/blip2-opt-2.7b...
Using low memory settings (half precision)...


model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Model loaded in 180.73 seconds
Image loaded: /content/SmartPet/dog-barking1.jpg (Size: (632, 475))
Generating caption...
Caption generated in 2.55 seconds
Caption: a small dog with its mouth open on a white background


In [None]:
# Caption a sample dog photo with the wrapper created earlier in the notebook.
dog_image_path = "/content/SmartPet/excessive-barking1.jpg"  # Update with your actual image path
caption = blip2_model.generate_caption(image_path=dog_image_path)
print("Caption:", caption)

Image loaded: /content/SmartPet/excessive-barking1.jpg (Size: (1000, 450))
Generating caption...
Caption generated in 0.49 seconds
Caption: a dog yawning with its mouth open
