In [3]:

%pip install diffusers transformers accelerate torch helium smolagents smolagents[openai] wikipedia pytube duckduckgo_search



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from diffusers import StableDiffusionPipeline
import torch
import os
from dotenv import load_dotenv

# --- Load environment variables ---
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# --- Initialize pipeline ---
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    use_auth_token=HF_TOKEN
)

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe.to(device)

# --- SmolAgent function ---
def smolagent_generate(prompt: str, output_path: str = "generated_image.png"):
    print(f"[SmolAgent] Generating image for prompt: '{prompt}'")
    image = pipe(prompt).images[0]
    image.save(output_path)
    print(f"[SmolAgent] Image saved as '{output_path}'")

# --- Example Usage ---
if __name__ == "__main__":
    user_prompt = input("Enter a prompt for SmolAgent to draw: ")
    smolagent_generate(user_prompt)

Keyword arguments {'use_auth_token': None} are not expected by StableDiffusionPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

[SmolAgent] Generating image for prompt: 'generate a image of a snow capped mountain'


  0%|          | 0/50 [00:00<?, ?it/s]

[SmolAgent] Image saved as 'generated_image.png'


In [26]:
from PIL import Image
import requests
from io import BytesIO

image_urls = [
    "https://upload.wikimedia.org/wikipedia/commons/e/e8/The_Joker_at_Wax_Museum_Plus.jpg", # Joker image
    "https://upload.wikimedia.org/wikipedia/en/9/98/Joker_%28DC_Comics_character%29.jpg" # Joker image
]

images = []
for url in image_urls:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" 
    }
    response = requests.get(url,headers=headers)
    image = Image.open(BytesIO(response.content)).convert("RGB")
    images.append(image)

In [1]:
import requests
from PIL import Image
from io import BytesIO
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import os

# 1. Download image from Wikipedia direct image URL
wiki_image_url = "https://upload.wikimedia.org/wikipedia/commons/e/e8/The_Joker_at_Wax_Museum_Plus.jpg"

response = requests.get(wiki_image_url)
print("Status code:", response.status_code)
if response.status_code != 200:
    raise Exception(f"Failed to download image: HTTP {response.status_code}")

try:
    image = Image.open(BytesIO(response.content)).convert("RGB")
except Exception as e:
    with open("downloaded_image.jpg", "wb") as f:
        f.write(response.content)
    raise RuntimeError("Failed to open image. Saved as downloaded_image.jpg") from e

# 2. Load processor and model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").to(device)

# 3. Prepare inputs (image + prompt)
prompt = (
    "Describe the costume and makeup that the comic character in these photos is wearing and return the description. "
    "Tell me if the guest is The Joker or Wonder Woman."
)

inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

# 4. Generate output
outputs = model.generate(**inputs, max_new_tokens=128)

# 5. Decode and print description
description = processor.decode(outputs[0], skip_special_tokens=True)
print("Model output:\n", description)



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Status code: 200


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model output:
 a joker


In [None]:
import requests
from PIL import Image
from io import BytesIO
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import os

# 1. Download image from Wikipedia direct image URL
wiki_image_url = "https://upload.wikimedia.org/wikipedia/commons/e/e8/The_Joker_at_Wax_Museum_Plus.jpg"

response = requests.get(wiki_image_url)
print("Status code:", response.status_code)
if response.status_code != 200:
    raise Exception(f"Failed to download image: HTTP {response.status_code}")

# 2. Try to open and save the image temporarily
try:
    image = Image.open(BytesIO(response.content)).convert("RGB")
    image.save("downloaded_image.jpg")  
except Exception as e:
    raise RuntimeError("Failed to open or save image from URL.") from e

# 3. Load processor and model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").to(device)

# 4. Prepare inputs (image + prompt)
prompt = (
    "Describe the costume and makeup that the comic character in these photos is wearing and return the description. "
    "Tell me if the guest is The Joker or Wonder Woman."
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

# 5. Generate output
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128)

# 6. Decode and print description
description = processor.decode(outputs[0], skip_special_tokens=True)
print("Model output:\n", description)

# 7. Delete the saved image after processing
if os.path.exists("downloaded_image.jpg"):
    os.remove("downloaded_image.jpg")
    print("Temporary image file deleted.")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Status code: 200


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model output:
 a joker
Temporary image file deleted.


In [None]:
import os

if os.path.exists("downloaded_image.jpg"):
    os.remove("downloaded_image.jpg")
    print("Temporary image file deleted.")