<a href="https://colab.research.google.com/github/sriram7737/AI-Call-Center-Prototype-Faster-Whisper-TinyLLaMA-Demo/blob/main/QVLAM_Colab_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Q-VLAM: Quantum-enhanced Vision-Language AI Model
This notebook combines a simple vision-language model (a CLIP vision encoder feeding a LLaMA language model) with a small Grover's quantum search simulation in Qiskit, intended as a building block for prompt optimization.

In [None]:
!pip install transformers accelerate peft torch torchvision timm  nvidia-smi qiskit==0.43.1 qiskit-aer==0.12.0

Collecting qiskit==0.43.1
  Downloading qiskit-0.43.1.tar.gz (9.6 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting qiskit-aer==0.12.0
  Downloading qiskit_aer-0.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting qiskit-terra==0.24.1 (from qiskit==0.43.1)
  Downloading qiskit_terra-0.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting qiskit-ibmq-provider==0.20.2 (from qiskit==0.43.1)
  Downloading qiskit_ibmq_provider-0.20.2-py3-none-any.whl.metadata (14 kB)
Collecting requests-ntlm<=1.1.0 (from qiskit-ibmq-provider==0.20.2->qiskit==0.43.1)
  Downloading requests_ntlm-1.1.0-py2.py3-none-any.whl.metadata (938 bytes)
Collecting numpy>=1.16.3 (from qiskit-aer==0.12.0)
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Co

In [None]:
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM
import torch
from PIL import Image
from qiskit_aer import Aer
from qiskit import QuantumCircuit
from qiskit.algorithms import Grover, AmplificationProblem

ModuleNotFoundError: No module named 'qiskit.algorithms'

## Load Models

In [None]:
# Checkpoint identifiers, kept in one place so they are easy to swap.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
LLAMA_CHECKPOINT = "sriram7737/llama-3.2-1b"

# Vision side: the CLIP model and its matching image pre-processor.
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Language side: the causal language model and its tokenizer.
llama_model = AutoModelForCausalLM.from_pretrained(LLAMA_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(LLAMA_CHECKPOINT)

## Define Vision-Language Model

In [None]:
class SimpleVisionLanguageModel(torch.nn.Module):
    """Prepend a single projected image token to a causal language model's input.

    The image is encoded by ``vision_model.vision_model``; the feature at the
    first sequence position of its ``last_hidden_state`` is passed through two
    linear layers (``projector`` then ``gate``) to reach the text model's
    hidden size, and the result is concatenated in front of the text token
    embeddings.

    Args:
        vision_model: Object exposing a ``vision_model`` sub-module whose
            ``config.hidden_size`` gives the feature width.
        text_model: Language model exposing ``config.hidden_size``,
            ``model.embed_tokens`` and accepting ``inputs_embeds`` /
            ``attention_mask`` keyword arguments.
        projection_dim: Width of the intermediate projection (default 512).
    """

    def __init__(self, vision_model, text_model, projection_dim=512):
        super().__init__()
        self.vision_model = vision_model.vision_model
        self.text_model = text_model
        self.projector = torch.nn.Linear(self.vision_model.config.hidden_size, projection_dim)
        self.gate = torch.nn.Linear(projection_dim, text_model.config.hidden_size)

    def forward(self, image, input_ids, attention_mask=None):
        """Run the text model on ``[vision token] + text embeddings``.

        Args:
            image: Pixel values forwarded to the vision encoder as
                ``pixel_values``.
            input_ids: Token ids; the first dimension is used as the batch size.
            attention_mask: Optional mask. A mask whose length equals the text
                length is extended with one attended position for the vision
                token; any other mask (for example one already covering the
                combined sequence) and ``None`` are passed through unchanged.

        Returns:
            Whatever ``text_model`` returns for the combined embeddings.
        """
        # The vision encoder is evaluated without gradient tracking.
        with torch.no_grad():
            vision_outputs = self.vision_model(pixel_values=image).last_hidden_state[:, 0, :]
        vision_embeds = self.projector(vision_outputs)
        gated_embeds = self.gate(vision_embeds)
        batch_size = input_ids.shape[0]
        vision_tokens = gated_embeds.unsqueeze(1).expand(batch_size, 1, -1)
        inputs_embeds = self.text_model.model.embed_tokens(input_ids)
        combined = torch.cat([vision_tokens, inputs_embeds], dim=1)

        # The combined sequence is one position longer than the text, so a
        # text-length mask must grow by one attended column at the front to
        # stay aligned with the embeddings.
        if attention_mask is not None and attention_mask.shape[1] == input_ids.shape[1]:
            vision_mask = attention_mask.new_ones((attention_mask.shape[0], 1))
            attention_mask = torch.cat([vision_mask, attention_mask], dim=1)

        return self.text_model(inputs_embeds=combined, attention_mask=attention_mask)

## Caption Generation Function

In [None]:
def generate_caption(image_path, prompt="Describe the image:", model=None):
    """Run one forward pass of the vision-language model and decode the result.

    The image is pre-processed with ``clip_processor`` and the prompt with
    ``tokenizer``; the arg-max token at every output position of a single
    forward pass is decoded into a string. Note that this is not
    autoregressive generation: no tokens are fed back into the model.

    Args:
        image_path: Path of the image file to open.
        prompt: Text prompt placed after the image token.
        model: Optional ``SimpleVisionLanguageModel`` to use. When omitted, a
            new wrapper is built around ``clip_model`` and ``llama_model``;
            its projector and gate layers are freshly initialised on every
            call, so pass a model explicitly for reproducible results or to
            use trained weights.

    Returns:
        The decoded string, with special tokens skipped.
    """
    # Context manager so the file handle is released even if conversion fails.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = clip_processor(images=image, return_tensors="pt")
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    attention_mask = torch.ones_like(input_ids)

    if model is None:
        model = SimpleVisionLanguageModel(clip_model, llama_model)

    # Switch to eval mode for inference, then restore whatever mode the
    # caller's model was in so a supplied model is not silently changed.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(inputs["pixel_values"], input_ids, attention_mask)
            generated_ids = torch.argmax(outputs.logits, dim=-1)
    finally:
        model.train(was_training)

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

## Quantum Search Simulation

In [None]:
def simulate_grover(target_index=1):
    """Run Grover's search over a 2-qubit register and return the result.

    A phase oracle marks the single computational-basis state whose integer
    value is ``target_index``. With four states and one marked state, a single
    Grover iteration is used.

    Args:
        target_index: Integer in ``0..3`` identifying the basis state to search
            for. The default (1) produces the gate sequence X(q1), CZ(q0, q1),
            X(q1).

    Returns:
        The result object returned by ``Grover.amplify``.

    Raises:
        ValueError: If ``target_index`` is outside ``0..3``.
    """
    # Imported locally so this cell does not depend on editing the imports cell.
    from qiskit.primitives import Sampler

    n_qubits = 2
    if not 0 <= target_index < 2 ** n_qubits:
        raise ValueError(
            f"target_index must be in [0, {2 ** n_qubits - 1}], got {target_index!r}"
        )

    # Qiskit bitstrings are little-endian: the rightmost character is qubit 0.
    target_bits = format(target_index, f"0{n_qubits}b")
    zero_qubits = [q for q, bit in enumerate(reversed(target_bits)) if bit == "0"]

    # CZ flips the phase of |11>; wrapping the qubits that should read 0 in
    # X gates moves that phase flip onto the target state instead.
    oracle = QuantumCircuit(n_qubits)
    for qubit in zero_qubits:
        oracle.x(qubit)
    oracle.cz(0, 1)
    for qubit in zero_qubits:
        oracle.x(qubit)

    # is_good_state lets the algorithm check the measured bitstring against
    # the marked state.
    problem = AmplificationProblem(oracle, is_good_state=[target_bits])

    # Grover is configured with a sampler primitive and executed via amplify();
    # it has no `oracle=` constructor argument or `run(backend)` method here.
    grover = Grover(iterations=1, sampler=Sampler())
    return grover.amplify(problem)

## Run Example

In [None]:
# Example usage (commented out). Uncomment the first pair once an image file
# named "sample.jpg" is available in the working directory; the second pair
# runs the Grover simulation with its default target.
# caption = generate_caption("sample.jpg")
# print("Generated Caption:", caption)

# grover_result = simulate_grover()
# print("Grover Result:", grover_result)