## Install all the requirements

In [None]:
!pip install -U transformers 
!pip install huggingface-hub

Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading tokenizers-0.20.2-cp312-cp312-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.2
    Uninstalling transformers-4.44.2:
  

## Log in to Hugging Face using your Token
If you can't paste the token into the notebook prompt, run these commands in a terminal instead.

In [None]:
# Tell git (in the user's global config) to use its "store" credential helper so the
# Hugging Face token is remembered between sessions.
# NOTE(review): git's store helper keeps credentials unencrypted on disk — confirm this
# is acceptable on a shared machine.
!git config --global credential.helper store
# Prompts for the Hugging Face access token. If the notebook cannot accept the pasted
# token, run both commands from a terminal instead.
!huggingface-cli login

In [None]:
import torch 
from PIL import Image

from transformers import (
    MllamaForConditionalGeneration,
    AutoProcessor,
    GenerationConfig,
)

## Loading the model

🚨 REMINDER: You must be granted access to the model before you can download and use it.
Request or check your access on the [model page](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)

In [None]:
model_id = "meta-llama/Llama-3.2-11B-Vision"

# Load the weights in bfloat16 and let `device_map="auto"` decide where to place them
# (GPU / MPS / CPU, depending on what is available).
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# NOTE: gradient checkpointing is intentionally not enabled. It trades compute for
# memory during the backward pass of training and has no effect on inference-only
# `generate()` calls, which is all this notebook does.

# The processor bundles the image preprocessor and the tokenizer for this checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

## Function definition

In [None]:
def answer_image_question(prompt, image_file, max_new_tokens=250):
  """Answer a free-form question about an image with the loaded vision model.

  Relies on the notebook-level `model`, `processor` and `model_id` defined in the
  "Loading the model" cell.

  Args:
    prompt: Question or instruction text. The `<|image|><|begin_of_text|>` prefix
      is prepended here, so pass the plain question only.
    image_file: Path (or file object) accepted by `PIL.Image.open`.
    max_new_tokens: Upper bound on the number of generated tokens. Defaults to
      250, the value that was previously hard-coded.

  Returns:
    The decoded first sequence returned by `model.generate`; special tokens are
    not stripped.
  """
  text = f"<|image|><|begin_of_text|>{prompt}"

  # Open the image in a context manager so the file handle is released as soon as
  # the processor has turned the pixels into tensors.
  with Image.open(image_file) as image:
    # Send the inputs to the device the model actually lives on. With
    # `device_map="auto"` that is usually not the CPU, and a hard-coded "cpu"
    # would make `generate` fail with a device mismatch.
    inputs = processor(image, text, return_tensors="pt").to(model.device)

  # Gradient checkpointing is a training-only option and not a generation setting,
  # so the checkpoint's generation config is used unmodified.
  generation_config = GenerationConfig.from_pretrained(model_id)
  output = model.generate(
      **inputs,
      generation_config=generation_config,
      max_new_tokens=max_new_tokens,
  )

  return processor.decode(output[0])

## Testing the function

Execution time depends on your hardware (RAM, GPU, etc.).

In [None]:
# Ask the model a question about a sample image and show its reply.
prompt = "What do you see in this image?"
image_file = "./imgs/ai-generated-stray-cat-in-danger-background-animal-background-photo.jpg"

# Generation may take a while; the decoded text is printed once it is ready.
answer = answer_image_question(prompt, image_file)
print(answer)