In [1]:
from dotenv import load_dotenv
import json
from together import Together
from pydantic import BaseModel, Field
from typing import List, Dict

In [2]:
# Load variables from a local .env file into the environment before the client is created.
load_dotenv()
together = Together() # add .env file with TOGETHER_API_KEY variable

In [3]:
# Cast of the scene: character name -> visual attributes.
# Attribute keys are the ones read by _build_character_description;
# a character may omit any of them (e.g. no "accessories" or "ethnicity").
characters_dict = {
    "Don Vito Corleone": {
        "age": "early 60s",
        "gender": "male",
        "hair": "slicked-back gray-black hair",
        "clothing": "dark three-piece suit",
        "body_type": "stocky, slightly hunched posture",
        "accessories": "gold ring on right hand, pocket watch",
        "ethnicity": "Italian-American",
    },
    "Tom Hagen": {
        "age": "early 40s",
        "gender": "male",
        "hair": "short, neatly combed brown hair",
        "facial_hair": "clean-shaven",
        "clothing": "gray suit, dark tie",
        "body_type": "medium build, upright posture",
        "ethnicity": "German-Irish",
    },
    "Johnny Fontane": {
        "age": "late 30s",
        "gender": "male",
        "hair": "short, slicked-back black hair",
        "facial_hair": "clean shaven",
        "clothing": "dark, stylish suit with an open collar",
        "body_type": "slim and fit",
        "accessories": "gold ring, cigarette",
    },
    "Sonny": {
        "age": "early 30s",
        "gender": "male",
        "hair": "curly, dark brown hair",
        "facial_hair": "clean-shaven",
        "clothing": "formal suit, slightly disheveled",
        "body_type": "athletic build",
        "ethnicity": "Italian-American",
    },
}

In [4]:
def _build_character_description(characters_dict: Dict[str, str]):
    """
    Generates text description of character from the dictionary
    """
    features = [
        characters_dict.get("ethnicity", ""),
        characters_dict.get("age", ""),
        characters_dict.get("gender", ""),
        characters_dict.get("hair", ""),
        characters_dict.get("facial_hair", ""),
        characters_dict.get("body_type", ""),
        f"wearing {characters_dict.get('clothing', '')}",
        f"with {characters_dict.get('accessories', '')}" if characters_dict.get("accessories") else ""
    ]
    
    return ", ".join(filter(None, features))

In [5]:
# Subject orientations the storyboard prompt refers to.
ORIENTATIONS = [
    "Front View",
    "Profile View",
    "Back View",
    "From Behind",
    "From Above",
    "From Below",
    "Three-Quarters View",
    "Long Shot",
    "Three-Quarters Rear View",
]

# Camera shot types the storyboard prompt refers to.
CAMERA_SHOTS = [
    "Aerial View",
    "Bird’s-Eye View",
    "Close-Up",
    "Cowboy Shot",
    "Dolly Zoom",
    "Dutch Angle",
    "Establishing Shot",
    "Extreme Close-Up",
    "Extreme Long Shot",
    "Full Shot",
    "Long Shot",
    "Medium Close-Up",
    "Medium Long Shot",
    "Medium Shot",
    "Over-the-Shoulder Shot",
    "Point-of-View Shot",
    "Two-Shot",
    "Fisheye Shot",
    "Worm's Eye",
    "Low-Angle Shot",
    "Macro Shot",
    "Tilt-Shift Shot",
    "Telephoto Shot",
]

# Reference to a character that appears in a scene.
class Character(BaseModel):
    name: str  # character name; _scene_to_prompts uses it as the key into the characters dictionary
    
# One storyboard scene; the fields match the per-scene JSON object requested in input_to_json's prompt.
class Scene(BaseModel):
    scene_number: int
    shot_type: str  # meant to be one of CAMERA_SHOTS; the type itself does not enforce it
    orientation: str  # meant to be one of ORIENTATIONS; the type itself does not enforce it
    characters: List[Character]
    environment: str  # description of the location where the scene takes place
    description: str
    
# Top-level reply object; input_to_json passes its JSON schema as the response_format schema.
class SceneList(BaseModel):
    scenes: List[Scene]

In [6]:
def input_to_json(script: str, characters: Dict[str, Dict], temperature: float = 0.7):
    """
    Converts a script and character descriptions into a JSON format for storyboard generation.

    The script, the textual character descriptions, the storyboard instructions
    and one worked example are sent as a single user message to the Together
    chat completion API, constrained to the JSON schema of ``SceneList``.

    Args:
        script: Raw film script text.
        characters: Character name -> attribute dictionary (each value is
            turned into text with ``_build_character_description``).
        temperature: Sampling temperature forwarded to the API.

    Returns:
        list: The "scenes" array of the model's JSON reply (a list of dicts),
        or an empty list when the reply is missing or cannot be parsed; in
        that case the error is printed. Errors raised by the API call itself
        are not caught here.
    """
    character_descriptions = {name: _build_character_description(desc) for name, desc in characters.items()}

    # Create the content outside of f-string to avoid nesting
    script_section = f"Here is the film script: \n{script}"
    characters_section = f"The characters in the script have the following descriptions: \n{json.dumps(character_descriptions, indent=2)}"

    # Render the allowed values as plain text so the model actually sees them;
    # without interpolation the prompt would contain the literal placeholders.
    camera_shots_text = ", ".join(CAMERA_SHOTS)
    orientations_text = ", ".join(ORIENTATIONS)

    instructions = f"""
### Storyboard Generation Instructions
1. **Number of Scenes**: Divide the entire script into a reasonable number of scenes (typically between 4 to 7 scenes), not too many or too few.
2. **Single Distinct Moment**: Each scene captures a single moment.
3. **Camera Angles & Orientation**: Choose from these shot types: {camera_shots_text}.  
Choose from these orientations: {orientations_text}.
4. **Location & Time**: Clearly derive environment from the script (e.g. INT DAY, DON'S OFFICE, etc.). Describe it in its details (size, lighhting, mood, organization of the objects, etc.). Notice that if it's the same across the different scenes, it must be written in the same way
5. **Characters**:
- List only characters relevant to the single moment in each scene.
- Each character must have the name and a short description (consistent from provided descriptions).
6. Clearly describe the scene including actions, character positions (foreground, background, left, right), emotions, and expressions.
7. **Scene Format**: Return JSON with a key 'scenes' as an array of structured objects:
- "scene_number": integer
- "shot_type": camera shot type (from provided list) 
- "orientation": orientation (from provided list)
- "characters": list of objects with:
        - "name": character's name, not as they appear on the script but as they were given to you in the description.
- "environment": short description of the location
- "description": short, vivid description focusing on actions, expressions, emotions of each single character. Also their relative position is clearly described. The description must be succint, without extra articles or words, it should be visual and useful for an image generation prompt. Ensure it makes sense with the shot type (e.g., if it's medium shot, don't say that the facce is covering the full image, otherwise it should be a close up).

Follow the above instructions very carefully. Notice that the scenes have no knowledge of each other's contents. So in case something is necessary, describe it again. 
"""

    example_input = """
### Example
Input: 
- Script is 
INT DAY: DON'S OFFICE (SUMMER 1945)

        DON CORLEONE
ACT LIKE A MAN!  By Christ in
Heaven, is it possible you turned
out no better than a Hollywood
finocchio.

Both HAGEN and JOHNNY cannot refrain from laughing.  The DON
smiles.  SONNY enters as noiselessly as possible, still
adjusting his clothes.

        DON CORLEONE
All right, Hollywood...Now tell me
about this Hollywood Pezzonovanta
who won't let you work.

        JOHNNY
He owns the studio.  Just a month
ago he bought the movie rights to
this book, a best seller.  And the
main character is a guy just like
me.  I wouldn't even have to act,
just be myself.

The DON is silent, stern.

        DON CORLEONE
You take care of your family?

        JOHNNY
Sure.

He glances at SONNY, who makes himself as inconspicuous as
he can.

        DON CORLEONE
You look terrible.  I want you to
eat well, to rest.  And spend time
with your family.  And then, at the
end of the month, this big shot
will give you the part you want.

        JOHNNY
It's too late.  All the contracts
have been signed, they're almost
ready to shoot.

        DON CORLEONE
I'll make him an offer he can't
refuse.

He takes JOHNNY to the door, pinching his cheek hard enough
to hurt.

        DON CORLEONE
Now go back to the party and leave
it to me.

He closes the door, smiling to himself.  Turns to HAGEN.

        DON CORLEONE
When does my daughter leave with
her bridegroom?

        HAGEN
They'll cut the cake in a few
minutes...leave right after that.
Your new son-in-law, do we give him
something important?

        DON CORLEONE
No, give him a living.  But never
let him know the family's business.
What else, Tom?

        HAGEN
I've called the hospital; they've
notified Consigliere Genco's family
to come and wait.  He won't last
out the night.

This saddens the DON.  He sighs.

        DON CORLEONE
Genco will wait for me.  Santino,
tell your brothers they will come
with me to the hospital to see
Genco.  Tell Fredo to drive the big
car, and ask Johnny to come with us.

        SONNY
And Michael?

        DON CORLEONE
All my sons.
        (to HAGEN)
Tom, I want you to go to California
tonight.  Make the arrangements.
But don't leave until I come back
from the hospital and speak to you.
Understood?

        HAGEN
Understood.

- Characters description from the dictionary gives
        - Don Vito Corleone: 'Italian-American, early 60s, male, slicked-back gray-black hair, stocky, slightly hunched posture, wearing dark three-piece suit, with gold ring on right hand, pocket watch'
        - Johnny Fontane: 'late 30s, male, short, slicked-back black hair, clean shaven, slim and fit, wearing dark, stylish suit with an open collar, with gold ring, cigarette'
        - Tom Hagen: 'German-Irish, early 40s, male, short, neatly combed brown hair, clean-shaven, medium build, upright posture, wearing gray suit, dark tie'
        - Sonny: 'Italian-American, early 30s, male, curly, dark brown hair, clean-shaven, athletic build, wearing formal suit, slightly disheveled'
"""

    example_output = """
Example Output:
{
   "scenes": [
   {
   "scene_number": 1,
   "shot_type": "Medium Shot",
   "orientation": "Front View",
   "characters": [
           {
           "name": "Don Vito Corleone"
           },
           {
           "name": "Johnny Fontane"
           },
           {
           "name": "Tom Hagen"
           }
   ],
   "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
   "description": "Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at Johnny. Johnny appears embarrassed, head slightly bowed. Hagen stands to the right, barely containing laughter. Tension and amusement mix in intimate office atmosphere."
   },
   {
   "scene_number": 2,
   "shot_type": "Two-Shot",
   "orientation": "Profile View",
   "characters": [
           {
           "name": "Don Vito Corleone"
           },
           {
           "name": "Johnny Fontane"
           },
           {
           "name": "Tom Hagen"
           },
           {
           "name": "Sonny"
           }
   ],
   "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
   "description": "Sonny quietly enters room from right, adjusting disheveled clothes. Don leans forward at desk, expression softening to business-like focus. Johnny stands center, straightening posture. Hagen observes from left corner. Atmosphere shifts from personal rebuke to business discussion."
   },
   {
   "scene_number": 3,
   "shot_type": "Close-Up",
   "orientation": "Front View",
   "characters": [
           {
           "name": "Don Vito Corleone"
           }
   ],
   "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
   "description": "Don Corleone's face fills frame, stern and contemplative. Eyes narrowed, jaw set firmly. Saying 'I'll make him an offer he can't refuse' with quiet, confident menace. Power and authority emanate from his expression."
   },
   {
   "scene_number": 4,
   "shot_type": "Medium Close-Up",
   "orientation": "Three-Quarters View",
   "characters": [
           {
           "name": "Don Vito Corleone"
           },
           {
           "name": "Johnny Fontane"
           }
   ],
   "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
   "description": "Don Corleone escorts Johnny to door, pinching his cheek firmly. Don's expression shows affection mixed with dominance. Johnny winces slightly at pain while showing relief and gratitude. Door frame visible on right edge of shot."
   },
   {
   "scene_number": 5,
   "shot_type": "Medium Shot",
   "orientation": "Front View",
   "characters": [
           {
           "name": "Don Vito Corleone"
           },
           {
           "name": "Tom Hagen"
           }
   ],
   "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
   "description": "Don Corleone turns from closed door, small smile fading to serious business expression. Hagen stands attentively near desk, notepad ready. Don moves toward chair, shoulders slightly hunched, gold ring catching light as he gestures."
   },
   {
   "scene_number": 6,
   "shot_type": "Over-the-Shoulder Shot",
   "orientation": "Profile View",
   "characters": [
           {
           "name": "Don Vito Corleone"
           },
           {
           "name": "Tom Hagen"
           },
           {
           "name": "Sonny"
           }
   ],
   "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
   "description": "Camera over Don's shoulder, facing Hagen and Sonny. Don's gray-black hair and dark suit visible in foreground. Hagen's face shows respectful attention. Sonny stands beside him, now composed. Don's voice carries weight as he issues final instructions about hospital visit."
   }
   ]
}
"""

    # Combine all content together without nesting f-strings
    user_content = f"{script_section}\n\n{characters_section}\n\n{instructions}\n{example_input}\n{example_output}"

    messages = [
        {"role": "system", "content": (
            "You are an AI specialized in creating structured storyboard scenes from a film script "
            "for image generation (e.g., stable diffusion). Each scene must capture a single distinct moment, "
            "should list relevant characters with consistent appearances, specify the environment, camera shot, "
            "and orientation, and provide direct clues for a diffusion model to generate images."
            )},
        {"role": "user", "content": user_content}
    ]

    response = together.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=messages,
        max_tokens=10000,
        temperature=temperature,
        response_format={"type": "json_object", "schema": SceneList.model_json_schema()}
    )

    try:
        output_json = response.choices[0].message.content
        return json.loads(output_json)["scenes"]
    except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e:
        # TypeError: content is None or the reply is not a JSON object;
        # IndexError: the response carries no choices.
        print("Error parsing JSON output:", e)
        return []

In [7]:
script = """
INT DAY: DON'S OFFICE (SUMMER 1945)

				DON CORLEONE
		ACT LIKE A MAN!  By Christ in
		Heaven, is it possible you turned
		out no better than a Hollywood
		finocchio.

	Both HAGEN and JOHNNY cannot refrain from laughing.  The DON
	smiles.  SONNY enters as noiselessly as possible, still
	adjusting his clothes.

				DON CORLEONE
		All right, Hollywood...Now tell me
		about this Hollywood Pezzonovanta
		who won't let you work.

				JOHNNY
		He owns the studio.  Just a month
		ago he bought the movie rights to
		this book, a best seller.  And the
		main character is a guy just like
		me.  I wouldn't even have to act,
		just be myself.

	The DON is silent, stern.

				DON CORLEONE
		You take care of your family?

				JOHNNY
		Sure.

	He glances at SONNY, who makes himself as inconspicuous as
	he can.

				DON CORLEONE
		You look terrible.  I want you to
		eat well, to rest.  And spend time
		with your family.  And then, at the
		end of the month, this big shot
		will give you the part you want.

				JOHNNY
		It's too late.  All the contracts
		have been signed, they're almost
		ready to shoot.

				DON CORLEONE
		I'll make him an offer he can't
		refuse.

	He takes JOHNNY to the door, pinching his cheek hard enough
	to hurt.

				DON CORLEONE
		Now go back to the party and leave
		it to me.

	He closes the door, smiling to himself.  Turns to HAGEN.

				DON CORLEONE
		When does my daughter leave with
		her bridegroom?

				HAGEN
		They'll cut the cake in a few
		minutes...leave right after that.
		Your new son-in-law, do we give him
		something important?

				DON CORLEONE
		No, give him a living.  But never
		let him know the family's business.
		What else, Tom?

				HAGEN
		I've called the hospital; they've
		notified Consiglere Genco's family
		to come and wait.  He won't last
		out the night.

	This saddens the DON.  He sighs.

				DON CORLEONE
		Genco will wait for me.  Santino,
		tell your brothers they will come
		with me to the hospital to see
		Genco.  Tell Fredo to drive the big
		car, and ask Johnny to come with us.

				SONNY
		And Michael?

				DON CORLEONE
		All my sons.
			  (to HAGEN)
		Tom, I want you to go to California
		tonight.  Make the arrangements.
		But don't leave until I come back
		from the hospital and speak to you.
		Understood?

				HAGEN
		Understood.
"""

In [8]:
# Ask the model for the storyboard; input_to_json returns [] if the reply cannot be parsed.
scenes = input_to_json(script, characters_dict)
print(json.dumps(scenes, indent=4))

[
    {
        "scene_number": 1,
        "shot_type": "Medium Shot",
        "orientation": "Front View",
        "characters": [
            {
                "name": "Don Vito Corleone"
            },
            {
                "name": "Johnny Fontane"
            },
            {
                "name": "Tom Hagen"
            }
        ],
        "environment": "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
        "description": "Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at Johnny. Johnny appears embarrassed, head slightly bowed. Hagen stands to the right, barely containing laughter. Tension and amusement mix in intimate office atmosphere."
    },
    {
        "scene_number": 2,
        "shot_type": "Two-Shot",
        "orientation": "Profile View",
        "characters": [
            {
                "name": "Don Vito Corleo

In [9]:
# Keep the first scene as a small sample for trying out the prompt builder.
scene = scenes[0]
scene

{'scene_number': 1,
 'shot_type': 'Medium Shot',
 'orientation': 'Front View',
 'characters': [{'name': 'Don Vito Corleone'},
  {'name': 'Johnny Fontane'},
  {'name': 'Tom Hagen'}],
 'environment': "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
 'description': 'Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at Johnny. Johnny appears embarrassed, head slightly bowed. Hagen stands to the right, barely containing laughter. Tension and amusement mix in intimate office atmosphere.'}

In [10]:
def _scene_to_prompts(scene, characters_dict, style="storyboard"):
    """
    Converts a scene into structured sub-prompts, where each character gets their own entry (character1, character2, etc.).
    """
    
    if style == "storyboard":
        style = "rough b&w pencil sketch, simple sketch lines, minimal shading, rough hatching, draft-style, J.C. Leyendecker style"
    # TODO manga style

    # subprompts = {
    #     "style": style,
    #     "environment": scene["environment"],
    #     "shot": f"{scene['shot_type']}, {scene['orientation']}",
    #     "description": scene["description"]
    # }
    
    subprompts = {}

    # Add each character as a separate sub-prompt
    for i, char in enumerate(scene["characters"]):
        char_name = char["name"]
        char_desc = _build_character_description(characters_dict[char_name])
        subprompts[f"character{i+1}"] = f"{char_name}: {char_desc}"
        
    subprompts["style"] = style
    subprompts["environment"] = scene["environment"]
    subprompts["shot"] = f"{scene['shot_type']}, {scene['orientation']}"
    subprompts["description"] = scene["description"]

    return subprompts

In [11]:
# Preview the sub-prompts built for the sample scene.
_scene_to_prompts(scene, characters_dict)

{'character1': 'Don Vito Corleone: Italian-American, early 60s, male, slicked-back gray-black hair, stocky, slightly hunched posture, wearing dark three-piece suit, with gold ring on right hand, pocket watch',
 'character2': 'Johnny Fontane: late 30s, male, short, slicked-back black hair, clean shaven, slim and fit, wearing dark, stylish suit with an open collar, with gold ring, cigarette',
 'character3': 'Tom Hagen: German-Irish, early 40s, male, short, neatly combed brown hair, clean-shaven, medium build, upright posture, wearing gray suit, dark tie',
 'style': 'rough b&w pencil sketch, simple sketch lines, minimal shading, rough hatching, draft-style, J.C. Leyendecker style',
 'environment': "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
 'shot': 'Medium Shot, Front View',
 'description': 'Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at 

In [12]:
def all_scenes_to_prompts(scenes, characters_dict, style="storyboard"):
    """
    Converts all scenes into structured diffusion model sub-prompts.

    Parameters:
    - scenes (list): List of structured scenes.
    - characters_dict (dict): Character name -> attribute dictionary.
    - style (str): The desired artistic style. "storyboard" selects the
      built-in preset, which is resolved by _scene_to_prompts.

    Returns:
    - list: One sub-prompt dictionary per scene, in the same order as `scenes`.
    """
    # The style is passed through untouched: _scene_to_prompts expands the
    # "storyboard" preset itself, so the preset text lives in a single place.
    return [_scene_to_prompts(scene, characters_dict, style) for scene in scenes]

In [13]:
# Sub-prompts of the first scene, built through the batch helper.
all_scenes_to_prompts(scenes, characters_dict)[0]

{'character1': 'Don Vito Corleone: Italian-American, early 60s, male, slicked-back gray-black hair, stocky, slightly hunched posture, wearing dark three-piece suit, with gold ring on right hand, pocket watch',
 'character2': 'Johnny Fontane: late 30s, male, short, slicked-back black hair, clean shaven, slim and fit, wearing dark, stylish suit with an open collar, with gold ring, cigarette',
 'character3': 'Tom Hagen: German-Irish, early 40s, male, short, neatly combed brown hair, clean-shaven, medium build, upright posture, wearing gray suit, dark tie',
 'style': 'rough b&w pencil sketch, simple sketch lines, minimal shading, rough hatching, draft-style, J.C. Leyendecker style',
 'environment': "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
 'shot': 'Medium Shot, Front View',
 'description': 'Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at 

In [14]:
def format_subprompts_for_diffusion(scene_prompts, prompt_weights=(2, 1.0, 1.2, 1.5, 0.9)):
    """
    Splits a sub-prompt dictionary into parallel lists of texts and weights.

    Args:
        scene_prompts (dict): Sub-prompt key -> text, e.g. the output of
            _scene_to_prompts. Iteration order of the dict is preserved.
        prompt_weights (sequence): Five weights, in this order: style,
            environment, shot, description, character. With the defaults the
            style has the highest weight (2) and characters the lowest (0.9).

    Returns:
        tuple: (subprompt_texts, subprompt_weights), two lists of equal length.
        Every key starting with "character" gets the character weight; a key
        that is neither a character nor one of the four named keys gets 1.0.
    """
    # Weighting rules for the non-character sub-prompts (can be tuned)
    weights = {
        "style": prompt_weights[0],
        "environment": prompt_weights[1],
        "shot": prompt_weights[2],
        "description": prompt_weights[3]
    }

    # All character sub-prompts share a single weight
    character_weight = prompt_weights[4]

    subprompt_texts = list(scene_prompts.values())
    subprompt_weights = [
        character_weight if key.startswith("character") else weights.get(key, 1.0)  # Default weight is 1.0
        for key in scene_prompts
    ]

    return subprompt_texts, subprompt_weights

In [15]:
# Parallel text/weight lists for the first scene, using the default prompt weights.
format_subprompts_for_diffusion(all_scenes_to_prompts(scenes, characters_dict)[0])

(['Don Vito Corleone: Italian-American, early 60s, male, slicked-back gray-black hair, stocky, slightly hunched posture, wearing dark three-piece suit, with gold ring on right hand, pocket watch',
  'Johnny Fontane: late 30s, male, short, slicked-back black hair, clean shaven, slim and fit, wearing dark, stylish suit with an open collar, with gold ring, cigarette',
  'Tom Hagen: German-Irish, early 40s, male, short, neatly combed brown hair, clean-shaven, medium build, upright posture, wearing gray suit, dark tie',
  'rough b&w pencil sketch, simple sketch lines, minimal shading, rough hatching, draft-style, J.C. Leyendecker style',
  "Don's office, daytime, summer 1945. Elegant wood-paneled room with large desk, leather chairs, warm lighting filtering through venetian blinds.",
  'Medium Shot, Front View',
  'Don Corleone stands imposingly behind desk, face stern with righteous anger, pointing finger at Johnny. Johnny appears embarrassed, head slightly bowed. Hagen stands to the right

In [16]:
import torch
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

In [17]:
def weighted_sum_prompt_embeddings(pipe, subprompt_texts, subprompt_weights, device, num_images_per_prompt=1):
    """
    Computes a weighted average of text embeddings for a list of subprompts.

    Each subprompt is tokenized (padded/truncated to the tokenizer's maximum
    length) and encoded separately; the resulting embeddings are summed with
    their weights and divided by the total weight.

    Args:
        pipe: The Stable Diffusion pipeline instance (its ``tokenizer`` and
            ``text_encoder`` are used).
        subprompt_texts (List[str]): List of subprompt strings.
        subprompt_weights (List[float]): Corresponding weights for each subprompt.
        device (str): The device to run on ("cuda" or "cpu").
        num_images_per_prompt (int): Number of images to generate per prompt.

    Returns:
        torch.Tensor: Combined prompt embeddings of shape
        (batch_size * num_images_per_prompt, seq_len, embed_dim), detached
        from the autograd graph.

    Raises:
        ValueError: If no subprompt is given, if texts and weights differ in
            length, or if the weights sum to zero.
    """
    if len(subprompt_texts) == 0:
        raise ValueError("subprompt_texts must contain at least one subprompt")
    if len(subprompt_texts) != len(subprompt_weights):
        raise ValueError(
            f"got {len(subprompt_texts)} subprompts but {len(subprompt_weights)} weights"
        )
    weight_total = sum(subprompt_weights)
    if weight_total == 0:
        raise ValueError("subprompt_weights must not sum to zero")

    # Same rule as the pipeline's own prompt encoding: the attention mask is
    # only handed to the text encoder when its config asks for it. Passing it
    # unconditionally changes the embeddings at the padded positions.
    use_attention_mask = bool(getattr(pipe.text_encoder.config, "use_attention_mask", False))

    encoded_prompts = []
    # Inference only: do not keep an autograd graph for every subprompt.
    with torch.no_grad():
        for text in subprompt_texts:
            # Tokenize the subprompt text
            text_inputs = pipe.tokenizer(
                text,
                padding="max_length",
                max_length=pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = text_inputs.input_ids.to(device)
            attention_mask = text_inputs.attention_mask.to(device) if use_attention_mask else None

            # Encode the subprompt into text embeddings
            text_embeds = pipe.text_encoder(input_ids, attention_mask=attention_mask)[0]
            encoded_prompts.append(text_embeds)

    # Weighted sum of the embeddings, normalized by the total weight
    weighted_embedding = sum(weight * embeds for weight, embeds in zip(subprompt_weights, encoded_prompts))
    combined_embedding = weighted_embedding / weight_total

    # Duplicate embeddings for each image per prompt if necessary
    batch_size, seq_len, embed_dim = combined_embedding.shape
    combined_embedding = combined_embedding.repeat(1, num_images_per_prompt, 1)
    combined_embedding = combined_embedding.view(batch_size * num_images_per_prompt, seq_len, embed_dim)

    return combined_embedding

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on the GPU; on a CPU-only host the weights stay in float32.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
if device == "cuda":
    # Model CPU offload moves each sub-model to the GPU on demand, so the
    # pipeline is not moved to CUDA beforehand (that would defeat the saving).
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(device)
pipe.enable_attention_slicing()

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [19]:
# scene_prompts = _scene_to_prompts(scenes[1], characters_dict)
# subprompt_texts, subprompt_weights = format_subprompts_for_diffusion(scene_prompts)
# combined_embeddings = weighted_sum_prompt_embeddings(pipe, subprompt_texts, subprompt_weights, device)
# with torch.no_grad():
#     output = pipe(prompt_embeds=combined_embeddings,
#                   negative_prompt="low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW",
#                   num_inference_steps=50)
# generated_image = output.images[0]
# generated_image.save("weighted_prompt_image1newnewnewnewnew.png")

In [20]:
import os
import torch

def generate_and_save_images(scenes, characters_dict, pipe, save_dir, device,
                             negative_prompt="low quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW",
                             num_inference_steps=50):
    """
    Render one image per scene and write each result into ``save_dir``.

    For every scene the sub-prompts are built, turned into a single weighted
    prompt embedding, and passed to the pipeline. The first image returned by
    the pipeline is kept, saved as ``image_<n>.png`` (``n`` starts at 1 and
    follows the order of ``scenes``), and a confirmation line is printed.

    Args:
        scenes (list): Scene objects, processed in order.
        characters_dict (dict): Character descriptions forwarded to the
            scene-to-prompt conversion.
        pipe: Pipeline instance; called with ``prompt_embeds``,
            ``negative_prompt`` and ``num_inference_steps``.
        save_dir (str): Output directory; created if it does not exist.
        device (str): Device forwarded to the embedding helper
            (e.g. "cuda" or "cpu").
        negative_prompt (str, optional): Negative prompt passed to the pipeline.
        num_inference_steps (int, optional): Inference steps passed to the pipeline.

    Returns:
        list: The generated images, one per scene, in scene order.
    """
    os.makedirs(save_dir, exist_ok=True)

    images = []
    for number, current_scene in enumerate(scenes, start=1):
        # Scene -> sub-prompts -> (texts, weights) -> one blended embedding.
        prompts = _scene_to_prompts(current_scene, characters_dict)
        texts, weights = format_subprompts_for_diffusion(prompts)
        embeds = weighted_sum_prompt_embeddings(pipe, texts, weights, device)

        # Gradients are never needed for generation.
        with torch.no_grad():
            result = pipe(
                prompt_embeds=embeds,
                negative_prompt=negative_prompt,
                num_inference_steps=num_inference_steps,
            )

        # Only the first image of the pipeline output is kept.
        image = result.images[0]
        images.append(image)

        target_path = os.path.join(save_dir, f"image_{number}.png")
        image.save(target_path)
        print(f"Image {number} saved to {target_path}")

    return images

In [None]:
save_directory = "stories/prompt_weight"
generated_images = generate_and_save_images(scenes, characters_dict, pipe, save_directory, device)

  0%|          | 0/50 [00:00<?, ?it/s]

Image 1 saved to stories/prompt_weight\image_1.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 2 saved to stories/prompt_weight\image_2.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 3 saved to stories/prompt_weight\image_3.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 4 saved to stories/prompt_weight\image_4.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 5 saved to stories/prompt_weight\image_5.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 6 saved to stories/prompt_weight\image_6.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 7 saved to stories/prompt_weight\image_7.png


  0%|          | 0/50 [00:00<?, ?it/s]

Image 8 saved to stories/prompt_weight\image_8.png


  0%|          | 0/50 [00:00<?, ?it/s]

# OLD

In [None]:
# class Scene(BaseModel):
#     scene_number: int = Field(description="The sequential number of the scene")
#     description: str = Field(description="A single-moment scene optimized for diffusion model sketch prompting")
#     orientation: str = Field(description="Character orientation chosen from predefined list")
#     expression: str = Field(description="Character facial expressions with names, e.g., 'John is worried'")
#     setting: str = Field(description="Where is it happening? Time of day? Environmental details?")
#     shot_type: str = Field(description="Camera shot chosen from predefined list")
#     character_positions: str = Field(description="Where are the characters in the scene?")

# class SceneList(BaseModel):
#     scenes: List[Scene]

# class CharacterInScene(BaseModel):
#     name: str
#     description: str
#     action: str
#     expression: str

# class Scene(BaseModel):
#     scene_number: int
#     description: str
#     shot_type: str
#     orientation: str
#     characters: List[CharacterInScene]
#     character_positions: str

# class SceneList(BaseModel):
#     scenes: List[Scene]

# def build_character_description(char: Dict[str, str]) -> str:
#     """Generates a concise physical description from a character dictionary."""
#     features = [
#         char.get("ethnicity", ""),
#         char.get("age", ""),
#         char.get("gender", ""),
#         char.get("hair", ""),
#         char.get("facial_hair", ""),
#         char.get("body_type", ""),
#         f"wearing {char.get('clothing', '')}",
#         f"with {char.get('accessories', '')}" if char.get("accessories") else ""
#     ]
#     return ", ".join(filter(None, features))

In [None]:
# def extract_scenes(story_text: str, characters: Dict[str, Dict], num_scenes=5, style="sketch storyboard") -> List[Scene]:
#     """Extracts structured storyboard scenes optimized for diffusion model prompting."""
    
#     # character_descriptions = {name: build_character_description(desc) for name, desc in characters.items()}

#     messages = [
#         {"role": "system", "content": "You are an AI specialized in structured storyboard scene generation for diffusion models."},
#         {"role": "user", "content": f'''
# The following story needs to be split into {num_scenes} structured storyboard scenes.

# ### **Story:**
# "{story_text}"

# ### **Character Descriptions:**
# These characters appear in the storyboard with **consistent appearance**:
# {json.dumps(character_descriptions, indent=2)}

# ### **Scene Format & Requirements**
# - Each scene represents a **single distinct moment** (no multiple actions).
# - **Characters are placed in the scene** with relative positions: (foreground, background, left, right, etc.).
# - **Shot Type & Orientation** must be defined.
# - **Each character's description is referenced**, but avoid full repetition.
# - Expressions & actions must be directly tied to the characters.

# ### **Output Format (JSON)**
# Each scene should include:
# 1. `"scene_number"`: Sequential index.
# 2. `"description"`: A single-moment, vivid summary.
# 3. `"shot_type"`: Camera shot from {CAMERA_SHOTS}.
# 4. `"orientation"`: Character orientation from {ORIENTATIONS}.
# 5. `"characters"`: A list of objects, each with:
#    - `"name"`: Character name
#    - `"description"`: Shortened version from provided character descriptions
#    - `"action"`: What the character is actively doing
#    - `"expression"`: Emotional state
# 6. `"character_positions"`: A sentence explaining where each character is in the frame.

# Return **valid JSON** with `"scenes"` as a list.
# '''}
#     ]

#     response = together.chat.completions.create(
#         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
#         messages=messages,
#         max_tokens=10000,
#         temperature=0.7,
#         response_format={"type": "json_object", "schema": SceneList.model_json_schema()}
#     )

#     try:
#         output_json = response.choices[0].message.content
#         return json.loads(output_json)["scenes"]  # Extract only the scenes list
#     except (json.JSONDecodeError, KeyError) as e:
#         print("Error: Could not parse JSON output", e)
#         return []

In [None]:
# story_text = """
# Late in the evening, Don Vito Corleone sits quietly in his office, the dim glow of a single lamp illuminating
# his desk. Tom Hagen enters with an urgent message about a family matter, his brow furrowed with concern.
# They exchange a few words in hushed tones, aware of how each stray sound can echo in the silent corridors.

# In the adjacent parlor, Johnny Fontane, fresh from a performance, sips whiskey while contemplating
# his next move. His gaze shifts between the gilded clock on the wall and the doorway leading to Don Vito's office.
# Outside, the autumn wind rustles through the garden, carrying with it the faint scent of oranges.

# Don Vito stands to greet Johnny, inviting him to share what's on his mind. Tom Hagen steps aside, quietly
# observing their exchange, ready to step in with legal advice if needed. The tension in the air is palpable
# as old loyalties and new opportunities hang in the balance. 
# """

# characters = {
#     "Don Vito Corleone": {
#         "age": "early 60s", "gender": "male", "hair": "slicked-back gray-black hair",
#         "clothing": "dark three-piece suit",
#         "body_type": "stocky, slightly hunched posture",
#         "accessories": "gold ring on right hand, pocket watch",
#         "ethnicity": "Italian-American"
#     },
#     "Tom Hagen": {
#         "age": "early 40s", "gender": "male", "hair": "short, neatly combed brown hair",
#         "facial_hair": "clean-shaven", "clothing": "gray suit, dark tie",
#         "body_type": "medium build, upright posture", "ethnicity": "German-Irish"
#     },
#     "Johnny Fontane": {
#         "age": "late 30s", "gender": "male", "hair": "short, slicked-back black hair",
#         "facial_hair": "clean shaven", "clothing": "dark, stylish suit with an open collar",
#         "body_type": "slim and fit", "accessories": "gold ring, cigarette"
#     }
# }

In [None]:
# scenes = extract_scenes(story_text, characters, num_scenes=5)
# print(json.dumps(scenes, indent=4))

[
    {
        "scene_number": 1,
        "description": "Don Vito sits quietly in his office, awaiting a message.",
        "shot_type": "Establishing Shot",
        "orientation": "Front View",
        "characters": [
            {
                "name": "Don Vito Corleone",
                "description": "Italian-American, early 60s, male",
                "action": "sitting",
                "expression": "serious"
            },
            {
                "name": "Tom Hagen",
                "description": "German-Irish, early 40s, male",
                "action": "entering",
                "expression": "concerned"
            }
        ],
        "character_positions": "Don Vito is in the foreground, and Tom Hagen is in the background."
    },
    {
        "scene_number": 2,
        "description": "Johnny Fontane sips whiskey in the parlor, contemplating his next move.",
        "shot_type": "Medium Close-Up",
        "orientation": "Three-Quarters View",
        "charact

In [None]:
# def scene_to_prompt(scene, style="rough storyboard sketch"):
#     """
#     Converts a structured scene JSON into an optimized prompt for Stable Diffusion.

#     Parameters:
#     - scene (dict): A single scene's structured representation.
#     - style (str): The desired artistic style (default is "rough storyboard sketch").

#     Returns:
#     - str: A structured text prompt optimized for image generation.
#     """

#     # Extract core elements
#     description = scene["description"]
#     shot_type = scene["shot_type"]
#     orientation = scene["orientation"]
#     character_positions = scene["character_positions"]

#     # Build character descriptions with actions & expressions
#     character_details = []
#     for char in scene["characters"]:
#         char_text = (
#             f"{char['description']}, {char['action']}, "
#             f"{char['expression']} expression,"
#         )
#         character_details.append(char_text)
    
#     character_text = " and ".join(character_details)  # Join multiple characters with " and "

#     # Construct final structured prompt
#     prompt = (
#         f"{style}. With {character_text}."
#         f"{description}"
#         f"{shot_type}, {orientation}. "
#         f"{character_positions}. "
#     )

#     return prompt

# def generate_all_prompts(scenes, style="rough storyboard sketch"):
#     """
#     Converts all scenes into structured diffusion model prompts.

#     Parameters:
#     - scenes (list): List of structured scenes.
#     - style (str): The desired artistic style.

#     Returns:
#     - list: List of formatted text prompts.
#     """
#     return [scene_to_prompt(scene, style) for scene in scenes]

# print(scene_to_prompt(scenes[0]))

rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With Italian-American, early 60s, male, sitting, serious expression, and German-Irish, early 40s, male, entering, concerned expression,.Don Vito sits quietly in his office, awaiting a message.Establishing Shot, Front View. Positioning: Don Vito is in the foreground, and Tom Hagen is in the background.. 


In [None]:
# # Convert all scenes and print
# prompts = generate_all_prompts(scenes)
# for i, prompt in enumerate(prompts):
#     print(f"Scene {i+1} Prompt:\n{prompt}\n")

Scene 1 Prompt:
rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With Italian-American, early 60s, male, sitting, serious expression, and German-Irish, early 40s, male, entering, concerned expression,.Don Vito sits quietly in his office, awaiting a message.Establishing Shot, Front View. Positioning: Don Vito is in the foreground, and Tom Hagen is in the background.. 

Scene 2 Prompt:
rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With late 30s, male, sipping whiskey, contemplative expression, and German-Irish, early 40s, male, observing, neutral expression,.Johnny Fontane sips whiskey in the parlor, contemplating his next move.Medium Close-Up, Three-Quarters View. Positioning: Johnny Fontane is in the foreground, with Tom Hagen in the background, observing from across the room.. 

Scene 3 Prompt:
rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With Italian

# Prompt Weighting

In [None]:
# from diffusers import StableDiffusionPipeline
# import torch

# pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
# pipe.to("cuda")
# negative_prompt = "low_quality, photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW"

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.32.1",
  "_name_or_path": "CompVis/stable-diffusion-v1-4",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [None]:
# image = pipe(prompts[1], negative_prompt=negative_prompt).images[0]
# image.save("scene2_storyboard.png")
# generate_and_save_scene(scenes[1], pipe, negative_prompt, f"{path}/scene2.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['view . positioning : johnny fontane is in the foreground , with tom hagen in the background , observing from across the room ..']


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# image = pipe(prompts[2], negative_prompt=negative_prompt).images[0]
# image.save("scene3_storyboard.png")
# generate_and_save_scene(scenes[2], pipe, negative_prompt, f"{path}/scene3.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [': don vito is in the foreground , with johnny fontane behind him , looking over his shoulder ..']


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# image = pipe(prompts[3], negative_prompt=negative_prompt).images[0]
# image.save("scene4_storyboard.png")
# generate_and_save_scene(scenes[3], pipe, negative_prompt, f"{path}/scene4.png")

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# image = pipe(prompts[4], negative_prompt=negative_prompt).images[0]
# image.save("scene5_storyboard.png")
# generate_and_save_scene(scenes[4], pipe, negative_prompt, f"{path}/scene5.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['johnny are standing in the foreground , with tom hagen in the background , observing the scene ..']


  0%|          | 0/50 [00:00<?, ?it/s]