# General Setup

# Class

In [1]:
import os
import json
import logging
from typing import Dict, List, Tuple, Any
from dotenv import load_dotenv
import torch
from PIL import Image
from together import Together
from src.prompt_scheme import SceneList
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
from src.models import MultiPromptPipelineApproach1

In [2]:
# Set up module-level logging.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: logging.getLogger returns the same logger
# object on every call, so re-running this cell (or reloading the module) would
# otherwise stack handlers and emit every record several times.
if not logger.handlers:
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
    logger.addHandler(stream_handler)

In [3]:
class StoryboardGenerator:
    ORIENTATIONS: List[str] = [
        "Front View", "Profile View", "Back View", "From Behind", "From Above",
        "From Below", "Three-Quarters View", "Long Shot", "Three-Quarters Rear View"
    ]

    CAMERA_SHOTS: List[str] = [
        "Aerial View", "Bird’s-Eye View", "Close-Up", "Cowboy Shot", "Dolly Zoom",
        "Dutch Angle", "Establishing Shot", "Extreme Close-Up", "Extreme Long Shot",
        "Full Shot", "Long Shot", "Medium Close-Up", "Medium Long Shot", "Medium Shot",
        "Over-the-Shoulder Shot", "Point-of-View Shot", "Two-Shot", "Fisheye Shot",
        "Worm's Eye", "Low-Angle Shot", "Macro Shot", "Tilt-Shift Shot", "Telephoto Shot"
    ]
    
    def __init__(
        self,
        script: str,
        characters: Dict[str, Dict[str, str]],
        style: str = "storyboard",
        prompt_weights: List[float] = [2, 1.0, 1.2, 1.5, 0.9],  # used only for 'prompt_weights' generation and 'modified-cfg'
        temperature: float = 0.7,
        device: str = "cpu",
        seed: int = 42
    ) -> None:
        """
        Stores the generation settings and creates the Together API client.

        Args:
            script: Film script text that is sent to the language model.
            characters: Mapping of character name to an attribute dictionary
                (keys such as "age", "hair", "clothing", ...).
            style: "storyboard" selects the built-in pencil-sketch style prompt;
                any other value is used verbatim as the style prompt.
            prompt_weights: Five weights, read by index as
                [style, environment, shot, description, character].
            temperature: Sampling temperature passed to the chat completion call.
            device: Torch device string the diffusion pipeline is moved to.
            seed: Random seed stored on the instance.
        """
        # Called before the client is constructed; presumably so that credentials
        # defined in a .env file are visible to Together() - confirm.
        load_dotenv()
        self.together = Together()
        self.script: str = script
        self.characters: Dict[str, Dict[str, str]] = characters
        self.style: str = style
        # Copy the list: the default argument is a single list object shared by
        # every call, so storing it by reference would let one instance's
        # mutation leak into all others (and into the caller's own list).
        self.prompt_weights: List[float] = list(prompt_weights)
        self.temperature: float = temperature
        self.device: str = device
        self.seed: int = seed
        # Lazily populated caches: scenes by input_to_json(), formatted prompts
        # by scenes_to_formatted_prompts().
        self.scenes: Any = None
        self.formatted_prompts: Any = None

        # Generation type of the currently loaded pipeline ("" = none loaded).
        # Valid values are "unique", "prompt_weights" and "modified-cfg"; they
        # are validated when a pipeline is requested.
        self.current_generation_type: str = ""

        self.pipe: Any = None
        
    def _setup_pipeline(self, generation_type: str) -> Any:
        """
        Loads and configures the diffusion pipeline for the given generation type.

        Args:
            generation_type: "unique" or "prompt_weights" load the stock
                StableDiffusionPipeline; "modified-cfg" loads
                MultiPromptPipelineApproach1.

        Returns:
            The pipeline, moved to ``self.device`` with a UniPC scheduler and
            attention slicing enabled.

        Raises:
            ValueError: If ``generation_type`` is not one of the supported values.
        """
        if generation_type in {"unique", "prompt_weights"}:
            pipeline_cls = StableDiffusionPipeline
        elif generation_type == "modified-cfg":
            pipeline_cls = MultiPromptPipelineApproach1
        else:
            raise ValueError(f"Unsupported generation type: {generation_type}")

        runs_on_cpu = torch.device(self.device).type == "cpu"
        # Half precision is meant for accelerators; on CPU (the default device of
        # this class) float16 weights are not reliably runnable, so fall back to
        # float32 there.
        dtype = torch.float32 if runs_on_cpu else torch.float16
        pipe = pipeline_cls.from_pretrained(
            "runwayml/stable-diffusion-v1-5", torch_dtype=dtype
        )

        pipe = pipe.to(self.device)
        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
        # Model CPU offload shuttles sub-models between CPU and an accelerator,
        # so it is only meaningful (and only safe to request) off-CPU.
        if not runs_on_cpu:
            pipe.enable_model_cpu_offload()
        pipe.enable_attention_slicing()
        return pipe
    
    def _get_pipeline(self, generation_type: str) -> Any:
        # Reload only if the requested type is different from the current one.
        if self.current_generation_type != generation_type or self.pipe is None:
            self.pipe = self._setup_pipeline(generation_type)
            self.current_generation_type = generation_type
        return self.pipe


    def _build_character_description(self, char_info: Dict[str, str]) -> str:
        """
        Generates a textual description for a character given its attribute dictionary.
        """
        features = [
            char_info.get("ethnicity", ""),
            char_info.get("age", ""),
            char_info.get("gender", ""),
            char_info.get("hair", ""),
            char_info.get("facial_hair", ""),
            char_info.get("body_type", ""),
            f"wearing {char_info.get('clothing', '')}",
            f"with {char_info.get('accessories', '')}" if char_info.get("accessories") else ""
        ]
        return ", ".join(filter(None, features))
    
    def input_to_json(self) -> List[Dict[str, Any]]:
        """
        Converts the script and character descriptions into a JSON structure for storyboard generation.

        Builds a prompt from the script, the per-character descriptions, the
        storyboard instructions and a worked example, sends it to the Together
        chat completion endpoint with a JSON response format constrained by the
        SceneList schema, and parses the "scenes" array out of the reply.

        Returns:
            The list of scene dictionaries (also stored in ``self.scenes``), or
            an empty list when the reply cannot be parsed. On failure
            ``self.scenes`` is left untouched.
        """
        character_descriptions = {
            name: self._build_character_description(desc) 
            for name, desc in self.characters.items()
        }
        script_section = f"Here is the film script: \n{self.script}"
        characters_section = f"The characters in the script have the following descriptions: \n{json.dumps(character_descriptions, indent=2)}"
        instructions = f"""
    ### Storyboard Generation Instructions
    1. **Number of Scenes**: Divide the entire script into a reasonable number of scenes (typically between 4 to 7 scenes), not too many or too few.
    2. **Single Distinct Moment**: Each scene captures a single moment.
    3. **Camera Angles & Orientation**: Choose from these shot types: {', '.join(self.CAMERA_SHOTS)}.  
    Choose from these orientations: {', '.join(self.ORIENTATIONS)}.
    4. **Location & Time**: Clearly derive environment from the script (e.g. INT DAY, DON'S OFFICE, etc.). Describe it in its details (size, lighting, mood, organization of the objects, etc.). Notice that if it's the same across the different scenes, it must be written in the same way
    5. **Characters**:
    - List only characters relevant to the single moment in each scene.
    - Each character must have the name and a short description (consistent from provided descriptions).
    6. Clearly describe the scene including actions, character positions (foreground, background, left, right), emotions, and expressions.
    7. **Scene Format**: Return JSON with a key 'scenes' as an array of structured objects:
    - "scene_number": integer
    - "shot_type": camera shot type (from provided list) 
    - "orientation": orientation (from provided list)
    - "characters": list of objects with:
            - "name": character's name, not as they appear on the script but as they were given to you in the description.
    - "environment": short description of the location
    - "description": short, vivid description focusing on actions, expressions, emotions of each single character. Also their relative position is clearly described. The description must be succint, without extra articles or words, it should be visual and useful for an image generation prompt. Ensure it makes sense with the shot type (e.g., if it's medium shot, don't say that the face is covering the full image, otherwise it should be a close up). Don't write the words they say, since they occupy tokens, unless it's a fundamental part of the script. Avoid useless adjectives or adverbs, be concise and clear.

    Follow the above instructions very carefully. Notice that the scenes have no knowledge of each other's contents. So in case something is necessary, describe it again. 
    """

        example_input = """
    ### Example
    Input: 
    - Script is 
    INT DAY: DON'S OFFICE (SUMMER 1945)

            DON CORLEONE
    ACT LIKE A MAN!  By Christ in
    Heaven, is it possible you turned
    out no better than a Hollywood
    finocchio.

    Both HAGEN and JOHNNY cannot refrain from laughing.  The DON
    smiles.  SONNY enters as noiselessly as possible, still
    adjusting his clothes.

            DON CORLEONE
    All right, Hollywood...Now tell me
    about this Hollywood Pezzonovanta
    who won't let you work.

            JOHNNY
    He owns the studio.  Just a month
    ago he bought the movie rights to
    this book, a best seller.  And the
    main character is a guy just like
    me.  I wouldn't even have to act,
    just be myself.

    The DON is silent, stern.

            DON CORLEONE
    You take care of your family?

            JOHNNY
    Sure.

    He glances at SONNY, who makes himself as inconspicuous as
    he can.

            DON CORLEONE
    You look terrible.  I want you to
    eat well, to rest.  And spend time
    with your family.  And then, at the
    end of the month, this big shot
    will give you the part you want.

            JOHNNY
    It's too late.  All the contracts
    have been signed, they're almost
    ready to shoot.

            DON CORLEONE
    I'll make him an offer he can't
    refuse.

    He takes JOHNNY to the door, pinching his cheek hard enough
    to hurt.

            DON CORLEONE
    Now go back to the party and leave
    it to me.

    He closes the door, smiling to himself.  Turns to HAGEN.

            DON CORLEONE
    When does my daughter leave with
    her bridegroom?

            HAGEN
    They'll cut the cake in a few
    minutes...leave right after that.
    Your new son-in-law, do we give him
    something important?

            DON CORLEONE
    No, give him a living.  But never
    let him know the family's business.
    What else, Tom?

            HAGEN
    I've called the hospital; they've
    notified Consigliere Genco's family
    to come and wait.  He won't last
    out the night.

    This saddens the DON.  He sighs.

            DON CORLEONE
    Genco will wait for me.  Santino,
    tell your brothers they will come
    with me to the hospital to see
    Genco.  Tell Fredo to drive the big
    car, and ask Johnny to come with us.

            SONNY
    And Michael?

            DON CORLEONE
    All my sons.
            (to HAGEN)
    Tom, I want you to go to California
    tonight.  Make the arrangements.
    But don't leave until I come back
    from the hospital and speak to you.
    Understood?

            HAGEN
    Understood.

    - Characters description from the dictionary gives
            - Don Vito Corleone: 'Italian-American, early 60s, male, slicked-back gray-black hair, stocky, slightly hunched posture, wearing dark three-piece suit, with gold ring on right hand, pocket watch'
            - Johnny Fontane: 'late 30s, male, short, slicked-back black hair, clean shaven, slim and fit, wearing dark, stylish suit with an open collar, with gold ring, cigarette'
            - Tom Hagen: 'German-Irish, early 40s, male, short, neatly combed brown hair, clean-shaven, medium build, upright posture, wearing gray suit, dark tie'
            - Sonny: 'Italian-American, early 30s, male, curly, dark brown hair, clean-shaven, athletic build, wearing formal suit, slightly disheveled'
    """

        example_output = """
    Example Output:
    {
    "scenes": [
    {
    "scene_number": 1,
    "shot_type": "Medium Shot",
    "orientation": "Front View",
    "characters": [
            {
            "name": "Don Vito Corleone"
            },
            {
            "name": "Johnny Fontane"
            },
            {
            "name": "Tom Hagen"
            }
    ],
    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
    "description": "Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at Johnny. Johnny appears embarrassed, head slightly bowed. Hagen stands to the right, barely containing laughter. Tension and amusement mix in intimate office atmosphere."
    },
    {
    "scene_number": 2,
    "shot_type": "Two-Shot",
    "orientation": "Profile View",
    "characters": [
            {
            "name": "Don Vito Corleone"
            },
            {
            "name": "Johnny Fontane"
            },
            {
            "name": "Tom Hagen"
            },
            {
            "name": "Sonny"
            }
    ],
    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
    "description": "Sonny quietly enters room from right, adjusting disheveled clothes. Don leans forward at desk, expression softening to business-like focus. Johnny stands center, straightening posture. Hagen observes from left corner. Atmosphere shifts from personal rebuke to business discussion."
    },
    {
    "scene_number": 3,
    "shot_type": "Close-Up",
    "orientation": "Front View",
    "characters": [
            {
            "name": "Don Vito Corleone"
            }
    ],
    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
    "description": "Don Corleone's face fills frame, stern and contemplative. Eyes narrowed, jaw set firmly. Saying 'I'll make him an offer he can't refuse' with quiet, confident menace. Power and authority emanate from his expression."
    },
    {
    "scene_number": 4,
    "shot_type": "Medium Close-Up",
    "orientation": "Three-Quarters View",
    "characters": [
            {
            "name": "Don Vito Corleone"
            },
            {
            "name": "Johnny Fontane"
            }
    ],
    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
    "description": "Don Corleone escorts Johnny to door, pinching his cheek firmly. Don's expression shows affection mixed with dominance. Johnny winces slightly at pain while showing relief and gratitude. Door frame visible on right edge of shot."
    },
    {
    "scene_number": 5,
    "shot_type": "Medium Shot",
    "orientation": "Front View",
    "characters": [
            {
            "name": "Don Vito Corleone"
            },
            {
            "name": "Tom Hagen"
            }
    ],
    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
    "description": "Don Corleone turns from closed door, small smile fading to serious business expression. Hagen stands attentively near desk, notepad ready. Don moves toward chair, shoulders slightly hunched, gold ring catching light as he gestures."
    },
    {
    "scene_number": 6,
    "shot_type": "Over-the-Shoulder Shot",
    "orientation": "Profile View",
    "characters": [
            {
            "name": "Don Vito Corleone"
            },
            {
            "name": "Tom Hagen"
            },
            {
            "name": "Sonny"
            }
    ],
    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
    "description": "Camera over Don's shoulder, facing Hagen and Sonny. Don's gray-black hair and dark suit visible in foreground. Hagen's face shows respectful attention. Sonny stands beside him, now composed. Don's voice carries weight as he issues final instructions about hospital visit."
    }
    ]
    }
    """

        user_content = f"{script_section}\n\n{characters_section}\n\n{instructions}\n{example_input}\n{example_output}"

        messages = [
            {"role": "system", "content": (
                "You are an AI specialized in creating structured storyboard scenes from a film script "
                "for image generation (e.g., stable diffusion). Each scene must capture a single distinct moment, "
                "should list relevant characters with consistent appearances, specify the environment, camera shot, "
                "and orientation, and provide direct clues for a diffusion model to generate images."
                )},
            {"role": "user", "content": user_content}
        ]

        response = self.together.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
            messages=messages,
            max_tokens=10000,
            temperature=self.temperature,
            response_format={"type": "json_object", "schema": SceneList.model_json_schema()}
        )

        # Keep the try body to the parsing steps only. Besides malformed JSON and
        # a missing "scenes" key, the reply may carry no choices (IndexError), a
        # None content, or a non-object JSON root (both TypeError).
        try:
            output_json = response.choices[0].message.content
            scenes = json.loads(output_json)["scenes"]
        except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e:
            logger.error("Error parsing JSON output: %s", e)
            return []

        self.scenes = scenes
        # Prompts cached from a previous scene list no longer match; drop them so
        # scenes_to_formatted_prompts() rebuilds from the new scenes.
        self.formatted_prompts = None
        return self.scenes
        
    def scenes_to_formatted_prompts(self) -> List[Tuple[List[str], List[float]]]:
        """
        Converts a list of scenes into structured diffusion model prompts with weights.
        
        Returns:
            List[Tuple[List[str], List[float]]]: Each tuple contains subprompt texts and their corresponding weights.
        """
        if self.formatted_prompts is not None:
            return self.formatted_prompts
        logger.info("Generating formatted prompts...")
        weight_map = {
            "style": self.prompt_weights[0],
            "environment": self.prompt_weights[1],
            "shot": self.prompt_weights[2],
            "description": self.prompt_weights[3]
        }
        character_weight = self.prompt_weights[4]

        style_value = ("rough b&w pencil sketch, simple sketch lines, minimal shading, rough hatching, draft-style, "
                       "J.C. Leyendecker style") if self.style == "storyboard" else self.style

        formatted_results: List[Tuple[List[str], List[float]]] = []
        if self.scenes is None:
            self.input_to_json()

        for scene in self.scenes:
            subprompts: Dict[str, str] = {}
            for i, char in enumerate(scene["characters"]):
                char_name = char["name"]
                char_info = self.characters.get(char_name)
                if not char_info:
                    matching_keys = [key for key in self.characters if char_name in key]
                    if matching_keys:
                        char_info = self.characters.get(matching_keys[0],
                                                        {"age": "unknown", "gender": "unknown", "hair": "unknown",
                                                         "clothing": "unknown", "body_type": "unknown"})
                    else:
                        char_info = {"age": "unknown", "gender": "unknown", "hair": "unknown", "clothing": "unknown", "body_type": "unknown"}
                char_desc = self._build_character_description(char_info)
                subprompts[f"character{i+1}"] = f"{char_name}: {char_desc}"
            
            subprompts["style"] = style_value
            subprompts["environment"] = scene["environment"]
            subprompts["shot"] = f"{scene['shot_type']}, {scene['orientation']}"
            subprompts["description"] = scene["description"]

            subprompt_texts: List[str] = []
            subprompt_weights: List[float] = []
            for key, text in subprompts.items():
                subprompt_texts.append(text)
                if key.startswith("character"):
                    subprompt_weights.append(character_weight)
                else:
                    subprompt_weights.append(weight_map.get(key, 1.0))
            formatted_results.append((subprompt_texts, subprompt_weights))
        self.formatted_prompts = formatted_results
        logger.info("Formatted prompts generated successfully.")
        return formatted_results
    
    def _save_image(self, image: Image.Image, image_path: str) -> None:
        """
        Saves the generated image to the specified path.
        """
        image.save(image_path)
        logger.info("Image saved to %s", image_path)
    
    # unique prompt
    def build_unique_prompts(self) -> List[str]:
        """
        Builds unique prompt strings for each scene by concatenating the style, shot, and description.
        
        Returns:
            List[str]: List of unique prompt strings.
        """
        if self.formatted_prompts is None:
            self.scenes_to_formatted_prompts()
        style_override = "rough b&w simple pencil sketch, J.C. Leyendecker style," if self.style == "storyboard" else self.style
        unique_prompts: List[str] = []
        for subprompt_texts, _ in self.formatted_prompts:
            if len(subprompt_texts) < 2:
                logger.error("Insufficient subprompt texts to build unique prompt.")
                continue
            # Assumes the penultimate text is the shot prompt and the last is the description.
            shot_prompt = subprompt_texts[-2]
            description = subprompt_texts[-1]
            unique_prompt = f"{style_override} {shot_prompt}: {description}"
            unique_prompts.append(unique_prompt)
        return unique_prompts
        
    def generate_and_save_images_unique_prompts(
        self,
        save_dir: str, 
        generation_type: str = "unique", 
        negative_prompt: str = "low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW", 
        num_inference_steps: int = 50, 
        guidance_scale: float = 7.5,
        )-> List[Image.Image]:
        """
        Generates images using unique prompts and saves them.
        
        Returns:
            List[Image.Image]: List of generated images.
        """
        pipe = self._get_pipeline(generation_type)  # generation_type should be "unique" here
        os.makedirs(save_dir, exist_ok=True)
        unique_prompts = self.build_unique_prompts()
        generated_images: List[Image.Image] = []
        for i, unique_prompt in enumerate(unique_prompts):
            with torch.no_grad():
                output = pipe(
                    prompt=unique_prompt,
                    negative_prompt=negative_prompt,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps
                )
            generated_image = output.images[0]
            generated_images.append(generated_image)
            image_path = os.path.join(save_dir, f"image_{i+1}.png")
            self._save_image(generated_image, image_path)
        return generated_images
    
    # prompt weights
    def weighted_sum_prompt_embeddings(
        self, 
        subprompt_texts: List[str], 
        subprompt_weights: List[float], 
        num_images_per_prompt: int = 1
    ) -> torch.Tensor:
        """
        Computes a weighted sum of text embeddings for a list of subprompts.
        
        Returns:
            torch.Tensor: Combined prompt embeddings.
        """
        encoded_prompts = []
        for text in subprompt_texts:
            text_inputs = self.pipe.tokenizer(
                text,
                padding="max_length",
                max_length=self.pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = text_inputs.input_ids.to(self.device)
            attention_mask = text_inputs.attention_mask.to(self.device) if "attention_mask" in text_inputs else None
            text_embeds = self.pipe.text_encoder(input_ids, attention_mask=attention_mask)[0]
            encoded_prompts.append(text_embeds)
        weighted_embedding = sum(weight * embeds for weight, embeds in zip(subprompt_weights, encoded_prompts))
        weight_total = sum(subprompt_weights)
        combined_embedding = weighted_embedding / weight_total
        batch_size, seq_len, embed_dim = combined_embedding.shape
        combined_embedding = combined_embedding.repeat(1, num_images_per_prompt, 1)
        combined_embedding = combined_embedding.view(batch_size * num_images_per_prompt, seq_len, embed_dim)
        return combined_embedding
    
    def generate_and_save_images_prompt_weights(
        self, 
        save_dir: str, 
        generation_type: str = "prompt_weights", 
        negative_prompt: str = "low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW", 
        num_inference_steps: int = 50, 
        guidance_scale: float = 7.5,
    ) -> List[Image.Image]:
        """
        Generates images using weighted prompt embeddings and saves them.
        """
        pipe = self._get_pipeline(generation_type)  # generation_type should be "prompt_weights" here
        os.makedirs(save_dir, exist_ok=True)
        generated_images: List[Image.Image] = []
        formatted_prompts = self.scenes_to_formatted_prompts()
        for i, (subprompt_texts, subprompt_weights) in enumerate(formatted_prompts):
            combined_embeddings = self.weighted_sum_prompt_embeddings(subprompt_texts, subprompt_weights)
            with torch.no_grad():
                output = pipe(
                    prompt_embeds=combined_embeddings,
                    negative_prompt=negative_prompt,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps
                )
            generated_image = output.images[0]
            generated_images.append(generated_image)
            image_path = os.path.join(save_dir, f"image_{i+1}.png")
            self._save_image(generated_image, image_path)
        return generated_images
    
    # modified-cfg
    def encode_subprompt(self, text: str) -> torch.Tensor:
        """
        Tokenizes and encodes a single subprompt into an embedding.
        """
        text_inputs = self.pipe.tokenizer(
            text,
            padding="max_length",
            max_length=self.pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_embeds = self.pipe.text_encoder(
            text_inputs.input_ids.to(self.device),
            attention_mask=text_inputs.attention_mask.to(self.device)
        )[0]
        return text_embeds
        
    def generate_and_save_images_multi_prompt(
        self, 
        save_dir: str, 
        generation_type: str = "modified-cfg", 
        negative_prompt: str = "low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW", 
        num_inference_steps: int = 50, 
        guidance_scale: float = 7.5,
    ) -> List[Image.Image]:
        """
        Generates images using the Multi-Prompt pipeline and saves them.
        """
        pipe = self._get_pipeline(generation_type)  # generation_type should be "modified-cfg" here
        os.makedirs(save_dir, exist_ok=True)
        generated_images: List[Image.Image] = []
        uncond_embeds = self.encode_subprompt(negative_prompt)
        formatted_prompts = self.scenes_to_formatted_prompts()
        for i, (subprompt_texts, subprompt_weights) in enumerate(formatted_prompts):
            subprompt_embeds = [self.encode_subprompt(sp) for sp in subprompt_texts]
            logger.info("Generating image for scene %d...", i+1)
            with torch.no_grad():
                output = pipe(
                    subprompt_embeds=subprompt_embeds,
                    subprompt_weights=subprompt_weights,
                    uncond_embeds=uncond_embeds,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps
                )
            generated_image = output.images[0]
            generated_images.append(generated_image)
            image_path = os.path.join(save_dir, f"image_{i+1}.png")
            self._save_image(generated_image, image_path)
        return generated_images
    
    def generate_and_save_images(
        self, 
        save_dir: str, 
        generation_type: str, 
        negative_prompt: str = "low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW", 
        num_inference_steps: int = 50, 
        guidance_scale: float = 7.5,
    ) -> List[Image.Image]:
        """
        Unified method that generates and saves images based on the provided generation type.
        """
        if generation_type == "unique":
            return self.generate_and_save_images_unique_prompts(save_dir, generation_type, negative_prompt, num_inference_steps, guidance_scale)
        elif generation_type == "prompt_weights":
            return self.generate_and_save_images_prompt_weights(save_dir, generation_type, negative_prompt, num_inference_steps, guidance_scale)
        elif generation_type == "modified-cfg":
            return self.generate_and_save_images_multi_prompt(save_dir, generation_type, negative_prompt, num_inference_steps, guidance_scale)
        else:
            raise ValueError(f"Unsupported generation type: {generation_type}")
        
    def generate_and_save_prompts_txt(self, save_dir: str, generation_type: str) -> None:
        os.makedirs(save_dir, exist_ok=True)
        file_path = os.path.join(save_dir, "prompts.txt")
        lines = []
        if generation_type == "unique":
            unique_prompts = self.build_unique_prompts()
            for i, prompt in enumerate(unique_prompts):
                lines.append(f"Scene {i+1} Unique Prompt:\n{prompt}\n")
        else:
            # For prompt_weights and modified-cfg, save subprompts breakdown.
            if self.formatted_prompts is None:
                self.scenes_to_formatted_prompts()
            for i, (subprompt_texts, subprompt_weights) in enumerate(self.formatted_prompts):
                lines.append(f"Scene {i+1} Subprompts:")
                for j, text in enumerate(subprompt_texts):
                    weight = subprompt_weights[j]
                    lines.append(f"  Subprompt {j+1} (weight {weight}): {text}")
                lines.append("")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        logger.info("Prompts saved to %s", file_path)
        
    def generate_and_save(
        self, 
        save_dir: str, 
        generation_type: str, 
        negative_prompt: str = "low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW", 
        num_inference_steps: int = 50, 
        guidance_scale: float = 7.5,
    ) -> None:
        """
        Unified method that generates and saves images and prompt text.
        """
        self.generate_and_save_images(save_dir, generation_type, negative_prompt, num_inference_steps, guidance_scale)
        self.generate_and_save_prompts_txt(save_dir, generation_type)
        logger.info("Storyboard generation completed successfully.")
        
    def __repr__(self) -> str:
        return (
            f"StoryboardGenerator(script={self.script[:50]}..., "
            f"characters={list(self.characters.keys())}, "
            f"style={self.style}, "
            f"prompt_weights={self.prompt_weights}, "
            f"temperature={self.temperature}, "
            f"device={self.device}, "
            f"seed={self.seed})"
        )

In [4]:
# Character attribute sheets, keyed by the character name the model is told to
# use. Each sheet is consumed through .get(), so absent attributes are allowed.
characters_dict = {
    "Don Vito Corleone": {
        "age": "early 60s",
        "gender": "male",
        "hair": "slicked-back gray-black hair",
        "clothing": "dark three-piece suit",
        "body_type": "stocky, slightly hunched posture",
        "accessories": "gold ring on right hand, pocket watch",
        "ethnicity": "Italian-American",
    },
    "Tom Hagen": {
        "age": "early 40s",
        "gender": "male",
        "hair": "short, neatly combed brown hair",
        "facial_hair": "clean-shaven",
        "clothing": "gray suit, dark tie",
        "body_type": "medium build, upright posture",
        "ethnicity": "German-Irish",
    },
    "Johnny Fontane": {
        "age": "late 30s",
        "gender": "male",
        "hair": "short, slicked-back black hair",
        "facial_hair": "clean shaven",
        "clothing": "dark, stylish suit with an open collar",
        "body_type": "slim and fit",
        "accessories": "gold ring, cigarette",
    },
    "Sonny": {
        "age": "early 30s",
        "gender": "male",
        "hair": "curly, dark brown hair",
        "facial_hair": "clean-shaven",
        "clothing": "formal suit, slightly disheveled",
        "body_type": "athletic build",
        "ethnicity": "Italian-American",
    },
}

# Sample screenplay excerpt (one interior scene) passed to StoryboardGenerator
# as `script`. The tab indentation follows screenplay layout: character cues
# are indented deepest, dialogue less, and action lines least.
# NOTE(review): the text is kept verbatim; "Consiglere" is spelled this way in
# this input — confirm whether it should read "Consigliere".
script = """
INT DAY: DON'S OFFICE (SUMMER 1945)

				DON CORLEONE
		ACT LIKE A MAN!  By Christ in
		Heaven, is it possible you turned
		out no better than a Hollywood
		finocchio.

	Both HAGEN and JOHNNY cannot refrain from laughing.  The DON
	smiles.  SONNY enters as noiselessly as possible, still
	adjusting his clothes.

				DON CORLEONE
		All right, Hollywood...Now tell me
		about this Hollywood Pezzonovanta
		who won't let you work.

				JOHNNY
		He owns the studio.  Just a month
		ago he bought the movie rights to
		this book, a best seller.  And the
		main character is a guy just like
		me.  I wouldn't even have to act,
		just be myself.

	The DON is silent, stern.

				DON CORLEONE
		You take care of your family?

				JOHNNY
		Sure.

	He glances at SONNY, who makes himself as inconspicuous as
	he can.

				DON CORLEONE
		You look terrible.  I want you to
		eat well, to rest.  And spend time
		with your family.  And then, at the
		end of the month, this big shot
		will give you the part you want.

				JOHNNY
		It's too late.  All the contracts
		have been signed, they're almost
		ready to shoot.

				DON CORLEONE
		I'll make him an offer he can't
		refuse.

	He takes JOHNNY to the door, pinching his cheek hard enough
	to hurt.

				DON CORLEONE
		Now go back to the party and leave
		it to me.

	He closes the door, smiling to himself.  Turns to HAGEN.

				DON CORLEONE
		When does my daughter leave with
		her bridegroom?

				HAGEN
		They'll cut the cake in a few
		minutes...leave right after that.
		Your new son-in-law, do we give him
		something important?

				DON CORLEONE
		No, give him a living.  But never
		let him know the family's business.
		What else, Tom?

				HAGEN
		I've called the hospital; they've
		notified Consiglere Genco's family
		to come and wait.  He won't last
		out the night.

	This saddens the DON.  He sighs.

				DON CORLEONE
		Genco will wait for me.  Santino,
		tell your brothers they will come
		with me to the hospital to see
		Genco.  Tell Fredo to drive the big
		car, and ask Johnny to come with us.

				SONNY
		And Michael?

				DON CORLEONE
		All my sons.
			  (to HAGEN)
		Tom, I want you to go to California
		tonight.  Make the arrangements.
		But don't leave until I come back
		from the hospital and speak to you.
		Understood?

				HAGEN
		Understood.
"""

In [5]:
# Pick the compute device: GPU when CUDA is usable, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
# Build the generator from the sample script and character sheet on the chosen
# device; style, prompt_weights, temperature and seed keep their constructor
# defaults.
generator = StoryboardGenerator(script, characters_dict, device=device)

In [7]:
# Run the full pipeline with generation_type="unique"; images and prompts.txt
# are written under the "unique" directory (see the log output below).
# NOTE(review): the log below shows some prompts exceeding CLIP's 77-token
# limit (the tail is truncated) and one image replaced by a black frame after
# the NSFW check fired — worth revisiting prompt length and seed.
generator.generate_and_save(save_dir="unique", generation_type="unique")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

2025-03-23 01:54:16,828 INFO: Generating formatted prompts...
2025-03-23 01:54:34,614 INFO: Formatted prompts generated successfully.


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:54:55,612 INFO: Image saved to unique\image_1.png
Token indices sequence length is longer than the specified maximum sequence length for this model (81 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['to business discussion .']


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:55:15,186 INFO: Image saved to unique\image_2.png


  0%|          | 0/50 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.
2025-03-23 01:55:36,702 INFO: Image saved to unique\image_3.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:55:58,793 INFO: Image saved to unique\image_4.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:56:21,863 INFO: Image saved to unique\image_5.png
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['instructions about hospital visit .']


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:56:45,684 INFO: Image saved to unique\image_6.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:57:10,200 INFO: Image saved to unique\image_7.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:57:34,499 INFO: Image saved to unique\image_8.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:57:59,050 INFO: Image saved to unique\image_9.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:58:24,654 INFO: Image saved to unique\image_10.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:58:50,718 INFO: Image saved to unique\image_11.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 01:59:17,273 INFO: Image saved to unique\image_12.png
2025-03-23 01:59:17,276 INFO: Prompts saved to unique\prompts.txt
2025-03-23 01:59:17,276 INFO: Storyboard generation completed successfully.


In [8]:
# Run the full pipeline with generation_type="prompt_weights"; images and
# prompts.txt are written under the "prompt_weights" directory (see the log
# output below). Per the constructor's comment, this mode is one of the two
# that use the instance's prompt_weights.
generator.generate_and_save(save_dir="prompt_weights", generation_type="prompt_weights")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:00:40,351 INFO: Image saved to prompt_weights\image_1.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:01:01,686 INFO: Image saved to prompt_weights\image_2.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:01:24,876 INFO: Image saved to prompt_weights\image_3.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:01:49,511 INFO: Image saved to prompt_weights\image_4.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:02:15,246 INFO: Image saved to prompt_weights\image_5.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:02:41,154 INFO: Image saved to prompt_weights\image_6.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:03:07,764 INFO: Image saved to prompt_weights\image_7.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:03:34,245 INFO: Image saved to prompt_weights\image_8.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:04:01,476 INFO: Image saved to prompt_weights\image_9.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:04:28,728 INFO: Image saved to prompt_weights\image_10.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:04:56,155 INFO: Image saved to prompt_weights\image_11.png


  0%|          | 0/50 [00:00<?, ?it/s]

2025-03-23 02:05:23,733 INFO: Image saved to prompt_weights\image_12.png
2025-03-23 02:05:23,740 INFO: Prompts saved to prompt_weights\prompts.txt
2025-03-23 02:05:23,741 INFO: Storyboard generation completed successfully.


In [9]:
# Run the full pipeline with generation_type="modified-cfg"; images are
# written under the "modified-cfg" directory (see the log output below). Per
# the constructor's comment, this mode also uses the instance's prompt_weights.
# NOTE(review): the log timestamps below show roughly 1.5-2 minutes per image
# here, versus roughly 20-30 seconds per image in the two runs above.
generator.generate_and_save(save_dir="modified-cfg", generation_type="modified-cfg")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

2025-03-23 02:05:34,559 INFO: Generating image for scene 1...
2025-03-23 02:07:25,643 INFO: Image saved to modified-cfg\image_1.png
2025-03-23 02:07:25,828 INFO: Generating image for scene 2...
2025-03-23 02:09:21,488 INFO: Image saved to modified-cfg\image_2.png
2025-03-23 02:09:21,714 INFO: Generating image for scene 3...
2025-03-23 02:10:52,393 INFO: Image saved to modified-cfg\image_3.png
2025-03-23 02:10:52,557 INFO: Generating image for scene 4...
2025-03-23 02:12:30,974 INFO: Image saved to modified-cfg\image_4.png
2025-03-23 02:12:31,152 INFO: Generating image for scene 5...
2025-03-23 02:14:14,676 INFO: Image saved to modified-cfg\image_5.png
2025-03-23 02:14:14,850 INFO: Generating image for scene 6...
2025-03-23 02:16:10,601 INFO: Image saved to modified-cfg\image_6.png
2025-03-23 02:16:10,773 INFO: Generating image for scene 7...
2025-03-23 02:17:55,524 INFO: Image saved to modified-cfg\image_7.png
2025-03-23 02:17:55,681 INFO: Generating image for scene 8...
2025-03-23 02:

# Old

In [None]:
# from dotenv import load_dotenv
# import json
# from together import Together
# from pydantic import BaseModel
# from typing import List, Dict
# load_dotenv()
# together = Together() # add .env file with TOGETHER_API_KEY variable
# ORIENTATIONS = [
#     "Front View", "Profile View", "Back View", "From Behind", "From Above",
#     "From Below", "Three-Quarters View", "Long Shot", "Three-Quarters Rear View"
# ]

# CAMERA_SHOTS = [
#     "Aerial View", "Bird’s-Eye View", "Close-Up", "Cowboy Shot", "Dolly Zoom",
#     "Dutch Angle", "Establishing Shot", "Extreme Close-Up", "Extreme Long Shot",
#     "Full Shot", "Long Shot", "Medium Close-Up", "Medium Long Shot", "Medium Shot",
#     "Over-the-Shoulder Shot", "Point-of-View Shot", "Two-Shot", "Fisheye Shot",
#     "Worm's Eye", "Low-Angle Shot", "Macro Shot", "Tilt-Shift Shot", "Telephoto Shot"
# ]

# class Character(BaseModel):
#     name: str
    
# class Scene(BaseModel):
#     scene_number: int
#     shot_type: str
#     orientation: str
#     characters: List[Character]
#     environment: str
#     description: str
    
# class SceneList(BaseModel):
#     scenes: List[Scene]

# def _build_character_description(characters_dict: Dict[str, str]):
#     """
#     Generates text description of character from the dictionary
#     """
#     features = [
#         characters_dict.get("ethnicity", ""),
#         characters_dict.get("age", ""),
#         characters_dict.get("gender", ""),
#         characters_dict.get("hair", ""),
#         characters_dict.get("facial_hair", ""),
#         characters_dict.get("body_type", ""),
#         f"wearing {characters_dict.get('clothing', '')}",
#         f"with {characters_dict.get('accessories', '')}" if characters_dict.get("accessories") else ""
#     ]
    
#     return ", ".join(filter(None, features))

# def input_to_json(script: str, characters: Dict[str, Dict], temperature: int = 0.7):
#     """
#     Converts a script and character descriptions into a JSON format for storyboard generation.
#     """
#     character_descriptions = {name: _build_character_description(desc) for name, desc in characters.items()}
    
#     script_section = f"Here is the film script: \n{script}"
#     characters_section = f"The characters in the script have the following descriptions: \n{json.dumps(character_descriptions, indent=2)}"
    
#     instructions = """
# ### Storyboard Generation Instructions
# 1. **Number of Scenes**: Divide the entire script into a reasonable number of scenes (typically between 4 to 7 scenes), not too many or too few.
# 2. **Single Distinct Moment**: Each scene captures a single moment.
# 3. **Camera Angles & Orientation**: Choose from these shot types: {', '.join(self.CAMERA_SHOTS)}.  
# Choose from these orientations: {', '.join(self.ORIENTATIONS)}.
# 4. **Location & Time**: Clearly derive environment from the script (e.g. INT DAY, DON'S OFFICE, etc.). Describe it in detail (size, lighting, mood, organization of the objects, etc.). Notice that if it's the same across the different scenes, it must be written in the same way.
# 5. **Characters**:
# - List only characters relevant to the single moment in each scene.
# - Each character must have the name and a short description (consistent from provided descriptions).
# 6. Clearly describe the scene including actions, character positions (foreground, background, left, right), emotions, and expressions.
# 7. **Scene Format**: Return JSON with a key 'scenes' as an array of structured objects:
# - "scene_number": integer
# - "shot_type": camera shot type (from provided list) 
# - "orientation": orientation (from provided list)
# - "characters": list of objects with:
#         - "name": character's name, not as they appear on the script but as they were given to you in the description.
# - "environment": short description of the location
# - "description": short, vivid description focusing on actions, expressions, emotions of each single character. Also their relative position is clearly described. The description must be succinct, without extra articles or words, it should be visual and useful for an image generation prompt. Ensure it makes sense with the shot type (e.g., if it's medium shot, don't say that the face is covering the full image, otherwise it should be a close up). Don't write the words they say, since they occupy tokens, unless it's a fundamental part of the script. Avoid useless adjectives or adverbs, be concise and clear.

# Follow the above instructions very carefully. Notice that the scenes have no knowledge of each other's contents. So in case something is necessary, describe it again. 
# """

#     example_input = """
# ### Example
# Input: 
# - Script is 
# INT DAY: DON'S OFFICE (SUMMER 1945)

#         DON CORLEONE
# ACT LIKE A MAN!  By Christ in
# Heaven, is it possible you turned
# out no better than a Hollywood
# finocchio.

# Both HAGEN and JOHNNY cannot refrain from laughing.  The DON
# smiles.  SONNY enters as noiselessly as possible, still
# adjusting his clothes.

#         DON CORLEONE
# All right, Hollywood...Now tell me
# about this Hollywood Pezzonovanta
# who won't let you work.

#         JOHNNY
# He owns the studio.  Just a month
# ago he bought the movie rights to
# this book, a best seller.  And the
# main character is a guy just like
# me.  I wouldn't even have to act,
# just be myself.

# The DON is silent, stern.

#         DON CORLEONE
# You take care of your family?

#         JOHNNY
# Sure.

# He glances at SONNY, who makes himself as inconspicuous as
# he can.

#         DON CORLEONE
# You look terrible.  I want you to
# eat well, to rest.  And spend time
# with your family.  And then, at the
# end of the month, this big shot
# will give you the part you want.

#         JOHNNY
# It's too late.  All the contracts
# have been signed, they're almost
# ready to shoot.

#         DON CORLEONE
# I'll make him an offer he can't
# refuse.

# He takes JOHNNY to the door, pinching his cheek hard enough
# to hurt.

#         DON CORLEONE
# Now go back to the party and leave
# it to me.

# He closes the door, smiling to himself.  Turns to HAGEN.

#         DON CORLEONE
# When does my daughter leave with
# her bridegroom?

#         HAGEN
# They'll cut the cake in a few
# minutes...leave right after that.
# Your new son-in-law, do we give him
# something important?

#         DON CORLEONE
# No, give him a living.  But never
# let him know the family's business.
# What else, Tom?

#         HAGEN
# I've called the hospital; they've
# notified Consigliere Genco's family
# to come and wait.  He won't last
# out the night.

# This saddens the DON.  He sighs.

#         DON CORLEONE
# Genco will wait for me.  Santino,
# tell your brothers they will come
# with me to the hospital to see
# Genco.  Tell Fredo to drive the big
# car, and ask Johnny to come with us.

#         SONNY
# And Michael?

#         DON CORLEONE
# All my sons.
#         (to HAGEN)
# Tom, I want you to go to California
# tonight.  Make the arrangements.
# But don't leave until I come back
# from the hospital and speak to you.
# Understood?

#         HAGEN
# Understood.

# - Characters description from the dictionary gives
#         - Don Vito Corleone: 'Italian-American, early 60s, male, slicked-back gray-black hair, stocky, slightly hunched posture, wearing dark three-piece suit, with gold ring on right hand, pocket watch'
#         - Johnny Fontane: 'late 30s, male, short, slicked-back black hair, clean shaven, slim and fit, wearing dark, stylish suit with an open collar, with gold ring, cigarette'
#         - Tom Hagen: 'German-Irish, early 40s, male, short, neatly combed brown hair, clean-shaven, medium build, upright posture, wearing gray suit, dark tie'
#         - Sonny: 'Italian-American, early 30s, male, curly, dark brown hair, clean-shaven, athletic build, wearing formal suit, slightly disheveled'
# """

#     example_output = """
# Example Output:
# {
#    "scenes": [
#    {
#    "scene_number": 1,
#    "shot_type": "Medium Shot",
#    "orientation": "Front View",
#    "characters": [
#            {
#            "name": "Don Vito Corleone"
#            },
#            {
#            "name": "Johnny Fontane"
#            },
#            {
#            "name": "Tom Hagen"
#            }
#    ],
#    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
#    "description": "Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at Johnny. Johnny appears embarrassed, head slightly bowed. Hagen stands to the right, barely containing laughter. Tension and amusement mix in intimate office atmosphere."
#    },
#    {
#    "scene_number": 2,
#    "shot_type": "Two-Shot",
#    "orientation": "Profile View",
#    "characters": [
#            {
#            "name": "Don Vito Corleone"
#            },
#            {
#            "name": "Johnny Fontane"
#            },
#            {
#            "name": "Tom Hagen"
#            },
#            {
#            "name": "Sonny"
#            }
#    ],
#    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
#    "description": "Sonny quietly enters room from right, adjusting disheveled clothes. Don leans forward at desk, expression softening to business-like focus. Johnny stands center, straightening posture. Hagen observes from left corner. Atmosphere shifts from personal rebuke to business discussion."
#    },
#    {
#    "scene_number": 3,
#    "shot_type": "Close-Up",
#    "orientation": "Front View",
#    "characters": [
#            {
#            "name": "Don Vito Corleone"
#            }
#    ],
#    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
#    "description": "Don Corleone's face fills frame, stern and contemplative. Eyes narrowed, jaw set firmly. Saying 'I'll make him an offer he can't refuse' with quiet, confident menace. Power and authority emanate from his expression."
#    },
#    {
#    "scene_number": 4,
#    "shot_type": "Medium Close-Up",
#    "orientation": "Three-Quarters View",
#    "characters": [
#            {
#            "name": "Don Vito Corleone"
#            },
#            {
#            "name": "Johnny Fontane"
#            }
#    ],
#    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
#    "description": "Don Corleone escorts Johnny to door, pinching his cheek firmly. Don's expression shows affection mixed with dominance. Johnny winces slightly at pain while showing relief and gratitude. Door frame visible on right edge of shot."
#    },
#    {
#    "scene_number": 5,
#    "shot_type": "Medium Shot",
#    "orientation": "Front View",
#    "characters": [
#            {
#            "name": "Don Vito Corleone"
#            },
#            {
#            "name": "Tom Hagen"
#            }
#    ],
#    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
#    "description": "Don Corleone turns from closed door, small smile fading to serious business expression. Hagen stands attentively near desk, notepad ready. Don moves toward chair, shoulders slightly hunched, gold ring catching light as he gestures."
#    },
#    {
#    "scene_number": 6,
#    "shot_type": "Over-the-Shoulder Shot",
#    "orientation": "Profile View",
#    "characters": [
#            {
#            "name": "Don Vito Corleone"
#            },
#            {
#            "name": "Tom Hagen"
#            },
#            {
#            "name": "Sonny"
#            }
#    ],
#    "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
#    "description": "Camera over Don's shoulder, facing Hagen and Sonny. Don's gray-black hair and dark suit visible in foreground. Hagen's face shows respectful attention. Sonny stands beside him, now composed. Don's voice carries weight as he issues final instructions about hospital visit."
#    }
#    ]
# }
# """

#     # Combine all content together without nesting f-strings
#     user_content = f"{script_section}\n\n{characters_section}\n\n{instructions}\n{example_input}\n{example_output}"
    
#     messages = [
#         {"role": "system", "content": (
#             "You are an AI specialized in creating structured storyboard scenes from a film script "
#             "for image generation (e.g., stable diffusion). Each scene must capture a single distinct moment, "
#             "should list relevant characters with consistent appearances, specify the environment, camera shot, "
#             "and orientation, and provide direct clues for a diffusion model to generate images."
#             )},
#         {"role": "user", "content": user_content}
#     ]
    
#     response = together.chat.completions.create(
#         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
#         messages=messages,
#         max_tokens=10000,
#         temperature=temperature,
#         response_format={"type": "json_object", "schema": SceneList.model_json_schema()}
#     )

#     try:
#         output_json = response.choices[0].message.content
#         return json.loads(output_json)["scenes"]
#     except (json.JSONDecodeError, KeyError) as e:
#         print("Error parsing JSON output:", e)
#         return []

# scenes = input_to_json(script, characters_dict)
# print(json.dumps(scenes, indent=4))

# def scenes_to_formatted_prompts(scenes, characters_dict, style="storyboard", prompt_weights=[2, 1.0, 1.2, 1.5, 0.9]):
#     """
#     Converts a list of scenes into structured diffusion model prompts with weights in one pass,
#     with a fallback for character names not matching exactly.
    
#     Parameters:
#     - scenes (list): List of scene dictionaries.
#     - characters_dict (dict): Dictionary with character details.
#     - style (str): Artistic style string (default "storyboard").
#     - prompt_weights (list): Weights for "style", "environment", "shot", "description", and characters.
    
#     Returns:
#     - List of tuples (subprompt_texts, subprompt_weights) for each scene.
#     """
#     # Define weight mapping for non-character keys
#     weight_map = {
#         "style": prompt_weights[0],
#         "environment": prompt_weights[1],
#         "shot": prompt_weights[2],
#         "description": prompt_weights[3]
#     }
#     character_weight = prompt_weights[4]

#     # Determine style string based on input style
#     if style == "storyboard":
#         style_value = "rough b&w pencil sketch, simple sketch lines, minimal shading, rough hatching, draft-style, J.C. Leyendecker style"
#     else:
#         style_value = style

#     formatted_results = []

#     for scene in scenes:
#         subprompts = {}

#         # Add each character's prompt with fallback handling
#         for i, char in enumerate(scene["characters"]):
#             char_name = char["name"]
#             char_info = characters_dict.get(char_name)
            
#             # If not found, try to find a key that contains the given name as a substring
#             if not char_info:
#                 matching_keys = [key for key in characters_dict if char_name in key]
#                 if matching_keys:
#                     char_info = characters_dict[matching_keys[0]]
#                 else:
#                     # Provide a default description if still not found
#                     char_info = {"age": "unknown", "gender": "unknown", "hair": "unknown",
#                                  "clothing": "unknown", "body_type": "unknown"}
            
#             char_desc = _build_character_description(char_info)
#             subprompts[f"character{i+1}"] = f"{char_name}: {char_desc}"
        
#         # Add other scene details
#         subprompts["style"] = style_value
#         subprompts["environment"] = scene["environment"]
#         subprompts["shot"] = f"{scene['shot_type']}, {scene['orientation']}"
#         subprompts["description"] = scene["description"]

#         # Prepare lists for texts and weights
#         subprompt_texts = []
#         subprompt_weights = []
#         for key, text in subprompts.items():
#             subprompt_texts.append(text)
#             if key.startswith("character"):
#                 subprompt_weights.append(character_weight)
#             else:
#                 subprompt_weights.append(weight_map.get(key, 1.0))
        
#         formatted_results.append((subprompt_texts, subprompt_weights))
    
#     return formatted_results

# formatted_prompts = scenes_to_formatted_prompts(scenes, characters_dict)
# print(formatted_prompts)

## Unique Prompt

In [None]:
# import os
# import torch
# from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

# device = "cuda" if torch.cuda.is_available() else "cpu"
# pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
# pipe = pipe.to(device)
# pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
# pipe.enable_attention_slicing()

# def build_unique_prompts(formatted_prompts, style_override="rough b&w simple pencil sketch, J.C. Leyendecker style,"):
#     """
#     Given formatted prompts (a list of tuples where each tuple is 
#     (subprompt_texts, subprompt_weights)), build a unique prompt for each scene
#     by concatenating the style, shot prompt, and description.
    
#     Returns:
#         List[str]: Unique prompt strings for each scene.
#     """
#     unique_prompts = []
#     for subprompt_texts, _ in formatted_prompts:
#         shot_prompt = subprompt_texts[-2]  
#         description = subprompt_texts[-1]
#         unique_prompt = f"{style_override} {shot_prompt}: {description}"
#         unique_prompts.append(unique_prompt)
#     return unique_prompts

# def generate_and_save_images_unique_prompts(formatted_prompts, pipe, save_dir, device,
#                                             negative_prompt="low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW",
#                                             num_inference_steps=50):
#     """
#     Generate images using unique prompts built from formatted_prompts.
#     """
#     os.makedirs(save_dir, exist_ok=True)
#     unique_prompts = build_unique_prompts(formatted_prompts)
#     generated_images = []
    
#     for i, unique_prompt in enumerate(unique_prompts):
#         with torch.no_grad():
#             output = pipe(prompt=unique_prompt,
#                           negative_prompt=negative_prompt,
#                           num_inference_steps=num_inference_steps)
#         generated_image = output.images[0]
#         generated_images.append(generated_image)
#         image_path = os.path.join(save_dir, f"image_{i+1}.png")
#         generated_image.save(image_path)
#         print(f"Image {i+1} saved to {image_path}")
    
#     return generated_images

# save_directory = "stories/unique_prompts"
# generated_images = generate_and_save_images_unique_prompts(formatted_prompts, pipe, save_directory, device)

## Subprompt Weights

In [None]:
# import os
# import torch
# from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

# device = "cuda" if torch.cuda.is_available() else "cpu"
# pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
# pipe = pipe.to(device)
# pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
# pipe.enable_attention_slicing()

# def weighted_sum_prompt_embeddings(pipe, subprompt_texts, subprompt_weights, device, num_images_per_prompt=1):
#     """
#     Computes a weighted sum of text embeddings for a list of subprompts.
    
#     Args:
#         pipe: The Stable Diffusion pipeline instance.
#         subprompt_texts (List[str]): List of subprompt strings.
#         subprompt_weights (List[float]): Corresponding weights for each subprompt.
#         device (str): The device to run on ("cuda" or "cpu").
#         num_images_per_prompt (int): Number of images to generate per prompt.
    
#     Returns:
#         torch.Tensor: Combined prompt embeddings of shape (batch_size * num_images_per_prompt, seq_len, embed_dim)
#     """
#     encoded_prompts = []
#     for text in subprompt_texts:
#         # Tokenize the subprompt text
#         text_inputs = pipe.tokenizer(
#             text,
#             padding="max_length",
#             max_length=pipe.tokenizer.model_max_length,
#             truncation=True,
#             return_tensors="pt",
#         )
#         input_ids = text_inputs.input_ids.to(device)
#         attention_mask = text_inputs.attention_mask.to(device) if "attention_mask" in text_inputs else None

#         # Encode the subprompt into text embeddings
#         text_embeds = pipe.text_encoder(input_ids, attention_mask=attention_mask)[0]
#         encoded_prompts.append(text_embeds)
    
#     # Compute the weighted sum of the embeddings
#     weighted_embedding = sum(weight * embeds for weight, embeds in zip(subprompt_weights, encoded_prompts))
#     weight_total = sum(subprompt_weights)
#     combined_embedding = weighted_embedding / weight_total  # Normalize if desired

#     # Duplicate embeddings for each image per prompt if necessary
#     batch_size, seq_len, embed_dim = combined_embedding.shape
#     combined_embedding = combined_embedding.repeat(1, num_images_per_prompt, 1)
#     combined_embedding = combined_embedding.view(batch_size * num_images_per_prompt, seq_len, embed_dim)
    
#     return combined_embedding

# def generate_and_save_images_prompt_weights(scenes, characters_dict, pipe, save_dir, device,
#                              negative_prompt="low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW",
#                              num_inference_steps=50):
#     """
#     Generate images for each scene using prompt embeddings from the provided pipeline
#     and save each image to the specified directory with a unique filename.

#     Args:
#         scenes (list): List of scene objects.
#         characters_dict (dict): Dictionary of character descriptions.
#         pipe: The Stable Diffusion pipeline instance.
#         save_dir (str): The directory where images will be saved.
#         device (str): The device to use ("cuda" or "cpu").
#         negative_prompt (str, optional): Negative prompt to steer generation.
#         num_inference_steps (int, optional): Number of inference steps for image generation.

#     Returns:
#         list: List of generated PIL.Image objects.
#     """
#     print("Generating images...")
#     os.makedirs(save_dir, exist_ok=True)
#     generated_images = []
#     for i, scene in enumerate(scenes):
#         # scene_prompts = _scene_to_prompts(scene, characters_dict)
#         # subprompt_texts, subprompt_weights = format_subprompts_for_diffusion(scene_prompts)
#         subprompt_texts, subprompt_weights = scenes_to_formatted_prompts([scene], characters_dict)[0]
#         combined_embeddings = weighted_sum_prompt_embeddings(pipe, subprompt_texts, subprompt_weights, device)
        
#         with torch.no_grad():
#             output = pipe(prompt_embeds=combined_embeddings,
#                           negative_prompt=negative_prompt,
#                           num_inference_steps=num_inference_steps)
#         generated_image = output.images[0]
#         generated_images.append(generated_image)
#         image_path = os.path.join(save_dir, f"image_{i+1}.png")
#         generated_image.save(image_path)
#         print(f"Image {i+1} saved to {image_path}")
        
#     return generated_images

# save_directory = "stories/prompt_weight"
# generated_images = generate_and_save_images_prompt_weights(scenes, characters_dict, pipe, save_directory, device)

## Modified Classifier-Free Guidance

In [None]:
# import os
# import torch
# from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

# def encode_subprompt(pipe: StableDiffusionPipeline, text: str, device: str = "cuda"):
#     """
#     Tokenize and encode a single subprompt into a [batch_size=1, seq_len, hidden_dim] embedding.
#     """
#     text_inputs = pipe.tokenizer(
#         text,
#         padding="max_length",
#         max_length=pipe.tokenizer.model_max_length,
#         truncation=True,
#         return_tensors="pt",
#     )
#     text_embeds = pipe.text_encoder(
#         text_inputs.input_ids.to(device),
#         attention_mask=text_inputs.attention_mask.to(device)
#     )[0]
#     return text_embeds

### Single Unconditional Pass + Multiple Conditional Passes

Let $\hat{\epsilon}_{\text{cond\_combined}}=\frac{1}{\sum_{i=1}^nw_i}\sum_{i=1}^nw_i\hat{\epsilon}_{\text{cond}_i}$,
where $n$ is the number of subprompts, $w_i$ is the weight of subprompt $i$, and each $\hat{\epsilon}_{\text{cond}_i}$ comes from one conditional UNet pass for subprompt $i$.
Then classifier-free guidance with scale $g$ gives $$\hat{\epsilon}=\hat{\epsilon}_{\text{uncond}}+g(\hat{\epsilon}_{\text{cond\_combined}}-\hat{\epsilon}_{\text{uncond}})$$
where $\hat{\epsilon}_{\text{uncond}}$ comes from a single unconditional pass at each step.

- Total UNet calls per step: $1+n$
- Each subprompt has a relative weight but they all share the same baseline unconditional pass

In [None]:
# class MultiPromptPipelineApproach1(StableDiffusionPipeline):
#     """
#     Multi-Prompt CFG with a SINGLE unconditional pass:
#       - At each diffusion step:
#         1. uncond_out = UNet(latent, uncond_embeds)
#         2. cond_out_i = UNet(latent, cond_embeds_i) for each subprompt i
#         3. cond_combined = weighted average of all cond_out_i
#         4. final_out = uncond_out + guidance_scale*(cond_combined - uncond_out)
#     """

#     @torch.no_grad()
#     def __call__(
#         self,
#         subprompt_embeds: list[torch.Tensor],
#         subprompt_weights: list[float],
#         uncond_embeds: torch.Tensor,
#         height: int = 512,
#         width: int = 512,
#         guidance_scale: float = 7.5,
#         num_inference_steps: int = 50,
#         generator: torch.Generator = None,
#         latents: torch.Tensor = None,
#         output_type: str = "pil",
#         return_dict: bool = True,
#         **kwargs
#     ):
#         device = self._execution_device
#         batch_size = uncond_embeds.shape[0]
#         num_subprompts = len(subprompt_embeds)

#         if num_subprompts != len(subprompt_weights):
#             raise ValueError("subprompt_embeds and subprompt_weights must have the same length.")

#         # 1. Validate or fallback to default height/width
#         if not height or not width:
#             height, width = self._default_height_width()

#         # 2. Set timesteps on the scheduler
#         self.scheduler.set_timesteps(num_inference_steps, device=device)
#         timesteps = self.scheduler.timesteps

#         # 3. Prepare latents
#         if latents is None:
#             shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
#             latents = torch.randn(shape, generator=generator, device=device, dtype=uncond_embeds.dtype)
#             latents = latents * self.scheduler.init_noise_sigma
#         else:
#             latents = latents.to(device)

#         # 4. Diffusion loop
#         for i, t in enumerate(timesteps):
#             latent_model_input = self.scheduler.scale_model_input(latents, t)

#             # (A) Unconditional pass
#             uncond_out = self.unet(latent_model_input, t, encoder_hidden_states=uncond_embeds, **kwargs).sample

#             # (B) Conditional passes (one per subprompt)
#             cond_outs = []
#             for cond_embed in subprompt_embeds:
#                 out = self.unet(latent_model_input, t, encoder_hidden_states=cond_embed, **kwargs).sample
#                 cond_outs.append(out)

#             # (C) Weighted average of conditional outputs
#             total_w = sum(subprompt_weights)
#             cond_combined = sum(w * o for w, o in zip(subprompt_weights, cond_outs)) / total_w

#             # (D) Classifier-Free Guidance
#             guided_out = uncond_out + guidance_scale * (cond_combined - uncond_out)

#             # (E) Step
#             latents = self.scheduler.step(guided_out, t, latents, **kwargs).prev_sample

#         # 5. Decode latents
#         if output_type == "latent":
#             if return_dict:
#                 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
#                 return StableDiffusionPipelineOutput(images=latents, nsfw_content_detected=None)
#             return latents

#         image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
#         image = self.image_processor.postprocess(image, output_type=output_type)

#         if return_dict:
#             from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
#             return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
#         return image

# # --- Load the Multi-Prompt Approach 1 pipeline ---
# print("Loading Approach 1 pipeline for scenes...")
# pipe1 = MultiPromptPipelineApproach1.from_pretrained(
#     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
# ).to("cuda")
# pipe1.scheduler = UniPCMultistepScheduler.from_config(pipe1.scheduler.config)
# pipe1.enable_model_cpu_offload()
# pipe1.enable_attention_slicing()

####### EXAMPLE
# # Subprompts
# subprompts = [
#     "ancient forest, misty atmosphere",
#     "mysterious ruins in the distance"
# ]
# # Encode each subprompt
# subprompt_embeds_1 = [encode_subprompt(pipe1, sp) for sp in subprompts]

# # Encode unconditional
# uncond_embeds_1 = encode_subprompt(pipe1, "")  # blank or negative prompt

# # Generate with approach 1

# # Weights for each subprompt
# weights_1 = [2.0, 5.0]
# print("Generating image with Approach 1 (single unconditional pass)...")
# output1 = pipe1(
#     subprompt_embeds=subprompt_embeds_1,
#     subprompt_weights=weights_1,
#     uncond_embeds=uncond_embeds_1,
#     guidance_scale=7.5,
#     num_inference_steps=25
# )
# output1.images[0].save("approach1_result.png")
# print("Saved approach1_result.png")

# # --- Generate images for each scene ---
# def generate_and_save_images_multi_prompt(scenes, characters_dict, pipe, save_dir, device,
#                                           num_inference_steps=50, guidance_scale=7.5):
#     """
#     Generate images for each scene using the Multi-Prompt pipeline and save each image to the specified directory.
    
#     Args:
#         scenes (list): List of scene objects (each scene is a dict).
#         characters_dict (dict): Dictionary of character descriptions.
#         pipe: The MultiPromptPipelineApproach1 pipeline instance.
#         save_dir (str): Directory where images will be saved.
#         device (str): Device to use (e.g., "cuda" or "cpu").
#         num_inference_steps (int, optional): Number of diffusion steps.
#         guidance_scale (float, optional): Guidance scale for classifier-free guidance.
        
#     Returns:
#         list: List of generated PIL.Image objects.
#     """
#     import os
#     import torch

#     os.makedirs(save_dir, exist_ok=True)
#     generated_images = []
#     uncond_embeds = encode_subprompt(pipe,
#                                      "low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW",
#                                      device=device)

#     # Iterate over scenes
#     for i, scene in enumerate(scenes):
#         # Convert the scene to subprompts and their corresponding weights
#         subprompt_texts, subprompt_weights = scenes_to_formatted_prompts([scene], characters_dict)[0]

#         # Encode each subprompt into an embedding
#         subprompt_embeds = [encode_subprompt(pipe, sp, device=device) for sp in subprompt_texts]
#         # Encode unconditional (negative/blank) prompt for the baseline
#         # uncond_embeds = encode_subprompt(pipe, "", device=device)

#         print(f"Generating image for scene {i+1}...")
#         with torch.no_grad():
#             output = pipe(
#                 subprompt_embeds=subprompt_embeds,
#                 subprompt_weights=subprompt_weights,
#                 uncond_embeds=uncond_embeds,
#                 guidance_scale=guidance_scale,
#                 num_inference_steps=num_inference_steps
#             )
#         generated_image = output.images[0]
#         generated_images.append(generated_image)
#         image_path = os.path.join(save_dir, f"scene_{i+1}.png")
#         generated_image.save(image_path)
#         print(f"Image {i+1} saved to {image_path}")

#     return generated_images

# save_directory = "stories/multi_prompt_approach1"
# generated_images = generate_and_save_images_multi_prompt(scenes, characters_dict, pipe1, save_directory, device)

### Multiple Unconditional Passes (One per Subprompt)

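We have $$\hat{\epsilon}=\hat{\epsilon}_{\text{uncond}}+g\sum_{i=1}^nw_i(\hat{\epsilon}_{\text{cond}_i}-\hat{\epsilon}_{\text{uncond}_i})$$
where $\hat{\epsilon}_{\text{uncond}}$ is the global unconditional prediction, and $\hat{\epsilon}_{\text{uncond}_i}$ and $\hat{\epsilon}_{\text{cond}_i}$ are the unconditional and conditional predictions for subprompt $i$.
- Total UNet calls per step: $1+2n$ (one global unconditional pass + two passes for each subprompt)
- Unlike the single-unconditional-pass approach, the weights $w_i$ are not normalized by $\sum_i w_i$, so they scale the overall guidance strength as well as the relative emphasis of each subprompt.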

In [None]:
# class MultiPromptPipelineApproach2(StableDiffusionPipeline):
#     """
#     Multi-Prompt CFG with MULTIPLE unconditional passes:
#       - 1 global unconditional pass per step: e_uncond
#       - For each subprompt i:
#           e_uncond_i (subprompt-specific unconditional)
#           e_cond_i    (subprompt conditional)
#       - Combine: e = e_uncond + g * sum_i[ w_i * ( e_cond_i - e_uncond_i ) ]
#     """

#     @torch.no_grad()
#     def __call__(
#         self,
#         global_uncond_embeds: torch.Tensor,
#         subprompt_pairs: list[tuple[torch.Tensor, torch.Tensor]],
#         subprompt_weights: list[float],
#         guidance_scale: float = 7.5,
#         height: int = 512,
#         width: int = 512,
#         num_inference_steps: int = 50,
#         generator: torch.Generator = None,
#         latents: torch.Tensor = None,
#         output_type: str = "pil",
#         return_dict: bool = True,
#         **kwargs
#     ):
#         """
#         Args:
#             global_uncond_embeds (Tensor): [batch, seq_len, hidden_dim] for the entire prompt's unconditional pass.
#             subprompt_pairs (list of (uncond_i, cond_i)):
#                 Each element is a tuple: (uncond_embeds_i, cond_embeds_i).
#             subprompt_weights (list[float]): Weights w_i for each subprompt i.
#         """
#         device = self._execution_device
#         batch_size = global_uncond_embeds.shape[0]
#         num_subprompts = len(subprompt_pairs)

#         if num_subprompts != len(subprompt_weights):
#             raise ValueError("subprompt_pairs and subprompt_weights must have the same length.")

#         # 1. Validate or fallback to default
#         if not height or not width:
#             height, width = self._default_height_width()

#         # 2. Scheduler timesteps
#         self.scheduler.set_timesteps(num_inference_steps, device=device)
#         timesteps = self.scheduler.timesteps

#         # 3. Prepare latents
#         if latents is None:
#             shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
#             latents = torch.randn(shape, generator=generator, device=device, dtype=global_uncond_embeds.dtype)
#             latents = latents * self.scheduler.init_noise_sigma
#         else:
#             latents = latents.to(device)

#         # 4. Diffusion loop
#         for i, t in enumerate(timesteps):
#             latent_model_input = self.scheduler.scale_model_input(latents, t)

#             # (A) Single global unconditional pass
#             e_uncond_global = self.unet(
#                 latent_model_input, t, encoder_hidden_states=global_uncond_embeds, **kwargs
#             ).sample

#             # (B) For each subprompt: unconditional + conditional
#             sub_deltas = []
#             for (uncond_i, cond_i), w in zip(subprompt_pairs, subprompt_weights):
#                 e_uncond_i = self.unet(latent_model_input, t, encoder_hidden_states=uncond_i, **kwargs).sample
#                 e_cond_i = self.unet(latent_model_input, t, encoder_hidden_states=cond_i, **kwargs).sample

#                 # Delta for subprompt i
#                 delta_i = w * (e_cond_i - e_uncond_i)
#                 sub_deltas.append(delta_i)

#             # (C) Combine sub-deltas
#             sum_deltas = sum(sub_deltas)  # sum_i w_i ( e_cond_i - e_uncond_i )

#             # (D) Final output
#             guided_out = e_uncond_global + guidance_scale * sum_deltas

#             # (E) Scheduler step
#             latents = self.scheduler.step(guided_out, t, latents, **kwargs).prev_sample

#         # 5. Decode
#         if output_type == "latent":
#             if return_dict:
#                 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
#                 return StableDiffusionPipelineOutput(images=latents, nsfw_content_detected=None)
#             return latents

#         image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
#         image = self.image_processor.postprocess(image, output_type=output_type)

#         if return_dict:
#             from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
#             return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
#         return image

# print("Loading Approach 2 pipeline...")
# pipe2 = MultiPromptPipelineApproach2.from_pretrained(
#     "runwayml/stable-diffusion-v1-5",
#     torch_dtype=torch.float16
# ).to("cuda")
# pipe2.scheduler = UniPCMultistepScheduler.from_config(pipe2.scheduler.config)
# pipe2.enable_model_cpu_offload()
# pipe2.enable_attention_slicing()

#### EXAMPLE
# # Suppose we want environment and style separately
# global_uncond = encode_subprompt(pipe2, "")  # global unconditional
# env_uncond = encode_subprompt(pipe2, "")     # unconditional for environment
# env_cond   = encode_subprompt(pipe2, "ancient forest, misty atmosphere")
# style_uncond = encode_subprompt(pipe2, "")   # unconditional for style
# style_cond   = encode_subprompt(pipe2, "cinematic style, high contrast")

# # subprompt_pairs = [ (uncond_env, cond_env), (uncond_style, cond_style) ]
# subprompt_pairs_2 = [
#     (env_uncond, env_cond),
#     (style_uncond, style_cond)
# ]

# weights_2 = [1.5, 1.8]
# print("Generating image with Approach 2 (multiple unconditional passes)...")
# output2 = pipe2(
#     global_uncond_embeds=global_uncond,
#     subprompt_pairs=subprompt_pairs_2,
#     subprompt_weights=weights_2,
#     guidance_scale=7.5,
#     num_inference_steps=25
# )
# output2.images[0].save("approach2_result.png")
# print("Saved approach2_result.png")

# def generate_and_save_images_multi_prompt2(scenes, characters_dict, pipe, save_dir, device,
#                                              num_inference_steps=50, guidance_scale=7.5):
#     """
#     Generate images for each scene using Multi-Prompt Approach 2 (multiple unconditional passes)
#     and save each image to the specified directory.
    
#     Args:
#         scenes (list): List of scene objects (each scene is a dict).
#         characters_dict (dict): Dictionary of character descriptions.
#         pipe: The MultiPromptPipelineApproach2 pipeline instance.
#         save_dir (str): Directory where images will be saved.
#         device (str): Device to use (e.g., "cuda" or "cpu").
#         num_inference_steps (int, optional): Number of diffusion steps.
#         guidance_scale (float, optional): Guidance scale for classifier-free guidance.
        
#     Returns:
#         list: List of generated PIL.Image objects.
#     """
#     import os
#     import torch

#     os.makedirs(save_dir, exist_ok=True)
#     generated_images = []

#     for i, scene in enumerate(scenes):
#         # Get subprompt texts and corresponding weights for the scene.
#         subprompt_texts, subprompt_weights = scenes_to_formatted_prompts([scene], characters_dict)[0]

#         # Encode the global unconditional prompt (re-encoded for every scene here, although it does not depend on the scene).
#         global_uncond_embeds = encode_subprompt(pipe, "", device=device)

#         # For each subprompt, encode a pair: (unconditional, conditional)
#         subprompt_pairs = []
#         for sp in subprompt_texts:
#             uncond_i = encode_subprompt(pipe, "", device=device)
#             cond_i = encode_subprompt(pipe, sp, device=device)
#             subprompt_pairs.append((uncond_i, cond_i))

#         print(f"Generating image for scene {i+1} using Approach 2...")
#         with torch.no_grad():
#             output = pipe(
#                 global_uncond_embeds=global_uncond_embeds,
#                 subprompt_pairs=subprompt_pairs,
#                 subprompt_weights=subprompt_weights,
#                 guidance_scale=guidance_scale,
#                 num_inference_steps=num_inference_steps
#             )
#         generated_image = output.images[0]
#         generated_images.append(generated_image)
#         image_path = os.path.join(save_dir, f"scene_{i+1}_approach2.png")
#         generated_image.save(image_path)
#         print(f"Image {i+1} saved to {image_path}")

#     return generated_images

# # Example usage:
# save_directory = "stories/multi_prompt_approach2"
# generated_images = generate_and_save_images_multi_prompt2(scenes, characters_dict, pipe2, save_directory, device)

Abandoned this approach: it is extremely slow (about 20 minutes for a single image), and the resulting images are not good either.