# Generative AI - Computer Vision

## 1. Image Captioning (Image to Text)

### BLIP (Bootstrapped Language Image Pretraining)

In [1]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm
2025-02-16 16:17:59.615743: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739704679.627407    6537 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739704679.630990    6537 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-16 16:17:59.642981: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load BLIP processor and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

In [5]:
# Generate caption function
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption

In [6]:
# Test on a sample image
image_path = "img/scene_man_dog.png"
caption = generate_caption(image_path)
print("Generated Caption:", caption)

Generated Caption: a man carrying a dog on his back


## 2. Image Generation (Text-to-Image)

In [None]:
# !pip install accelerate

### 2.1 LCM DreamShaper v7

In [7]:
import torch
from diffusers import DiffusionPipeline
import time

In [8]:
pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")

Loading pipeline components...: 100%|█████████████| 7/7 [00:00<00:00,  7.85it/s]


In [9]:
pipe.to(torch_device="cuda", torch_dtype=torch.float32)

LatentConsistencyModelPipeline {
  "_class_name": "LatentConsistencyModelPipeline",
  "_diffusers_version": "0.32.2",
  "_name_or_path": "SimianLuo/LCM_Dreamshaper_v7",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "LCMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [10]:
prompt = "A girl riding a motorcycle in the rain"

In [11]:
images = pipe(
    prompt=prompt, 
    num_inference_steps=4, 
    guidance_scale=8.0, 
    lcm_origin_steps=50, 
    output_type="pil").images

100%|█████████████████████████████████████████████| 4/4 [00:19<00:00,  4.78s/it]


In [13]:
for img in images:
    img.save(f"img/{time.time()}.png")
    img.show()

### 2.2 Stable Diffusion 1.5

In [None]:
import torch
from diffusers import StableDiffusionPipeline
import IPython.display as display
import time

In [None]:
# Load the Stable Diffusion model
model_id = "runwayml/stable-diffusion-v1-5"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = StableDiffusionPipeline.from_pretrained(model_id).to(device)

In [None]:
# Function to generate an image from text
def generate_image(prompt, num_inference_steps=50, guidance_scale=7.5):
    image = pipeline(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).images[0]
    return image

In [None]:
# Example Usage
prompt = "A futuristic cityscape at sunset, ultra-realistic"
image = generate_image(prompt)

In [None]:
# Save and display the generated image
image.save(f"img/{time.time()}.png")
image.show()

### 2.3 SDXL Turbo

### 2.4 DreamShaper 7

In [None]:
from diffusers import DiffusionPipeline

In [None]:
# Load the DreamShaper model
model_id = "Lykon/dreamshaper-7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

In [None]:
# Function to generate an image from text
def generate_image(prompt, num_inference_steps=30, guidance_scale=7.5):
    image = pipeline(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).images[0]
    return image

In [None]:
# Example Usage
prompt = "Hot and Spicy"
image = generate_image(prompt)

In [None]:
# Save and display the generated image
image.save(f"{time.time()}.png")
display.display(image)

### 2.3 LCM Dreamshaper 8

In [14]:
from diffusers import AutoPipelineForText2Image, LCMScheduler
import torch

In [15]:
pipe = AutoPipelineForText2Image.from_pretrained('lykon/dreamshaper-8-lcm', torch_dtype=torch.float32)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

Loading pipeline components...: 100%|█████████████| 7/7 [00:00<00:00, 11.64it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 58.00 MiB. GPU 0 has a total capacity of 5.92 GiB of which 62.12 MiB is free. Including non-PyTorch memory, this process has 5.33 GiB memory in use. Of the allocated memory 5.08 GiB is allocated by PyTorch, and 167.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
prompt = "portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal, elegant, sharp focus, soft lighting, vibrant colors"

generator = torch.manual_seed(0)
image = pipe(prompt, num_inference_steps=15, guidance_scale=2, generator=generator).images[0]  
image.save("./image.png")