<a href="https://colab.research.google.com/github/dp-08/Gen-AI/blob/main/Human_face_generator_(VAE).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow keras diffusers transformers accelerate

In [None]:
import io
import os

import tensorflow as tf
import torch
from diffusers import StableDiffusionPipeline
from PIL import Image

# Generate a single portrait with a Stable Diffusion text-to-image pipeline
# and write it to disk.
#
# Diffusers pipelines execute on PyTorch, so the compute device is selected
# through torch (not TensorFlow). A GPU is strongly recommended; the script
# falls back to the CPU when no CUDA device is visible.

# --- Configuration ---
# Hugging Face Hub repo id of the checkpoint to load.
# NOTE(review): confirm this repo id still resolves on the Hub.
MODEL_ID = "runwayml/stable-diffusion-v1-5"
PROMPT = "A photorealistic portrait of an old man with a kind smile, detailed eyes, wearing a tweed jacket, soft studio lighting."
FACE_PROMPT = PROMPT  # The prompt actually passed to the pipeline
OUTPUT_FILENAME = "generated_face.png"

# Use CUDA when PyTorch can see a GPU; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on the GPU reduces memory use; float32 is kept on the CPU.
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

# --- 1. Load the Model and Components (Text Encoder, U-Net, VAE) ---
try:
    # StableDiffusionPipeline bundles the components needed for generation:
    # 1. Text encoder: turns the prompt into an embedding.
    # 2. U-Net: the denoising network, conditioned on the text embedding.
    # 3. VAE (Variational Autoencoder): decodes the final latent into an image.

    print(f"Loading Stable Diffusion pipeline: {MODEL_ID}...")
    pipeline = StableDiffusionPipeline.from_pretrained(
        MODEL_ID,
        use_safetensors=True,
        torch_dtype=TORCH_DTYPE,
    )

    # Without this call the pipeline stays on the CPU even when a GPU exists.
    print(f"Using device: {DEVICE}")
    pipeline = pipeline.to(DEVICE)

    print("Model components loaded successfully.")

    # --- 2. Generate the Image ---
    print(f"Generating image with prompt: '{FACE_PROMPT}'")

    # The pipeline call runs the whole text-to-image flow:
    # a. The prompt is encoded by the text encoder.
    # b. Starting from a noisy latent, the U-Net repeatedly denoises it,
    #    guided by the text embedding (the 'diffusion' process).
    # c. The final latent is passed to the VAE decoder to produce the image.

    # Note: 'num_inference_steps' trades quality for speed. ~25-50 is standard.
    result = pipeline(
        prompt=FACE_PROMPT,
        height=512,
        width=512,
        num_inference_steps=30,
        guidance_scale=7.5,  # How strongly the prompt steers the image
    )
    image = result.images[0]

    # --- 3. Save the Result ---
    print(f"Image generation complete. Saving to {OUTPUT_FILENAME}")
    image.save(OUTPUT_FILENAME)
    print("Successfully saved the generated face image.")

    # Optional: call image.show() to view the result where an image viewer
    # is available.

except Exception as e:
    # Top-level boundary of the script: report the failure instead of raising
    # so the notebook cell finishes with a readable message.
    print("\nAn error occurred during execution:")
    print("Please ensure you have sufficient VRAM (typically 8GB+) and that the libraries are installed.")
    print(f"Error details: {e}")