In [1]:
from dotenv import load_dotenv
import json
from together import Together
from pydantic import BaseModel, Field
from typing import List 

In [None]:
load_dotenv()
together = Together() # add .env file with TOGETHER_API_KEY variable

In [3]:
ORIENTATIONS = [
    "Front View", "Profile View", "Back View", "From Behind", "From Above",
    "From Below", "Three-Quarters View", "Long Shot", "Three-Quarters Rear View"
]

EXPRESSIONS = [
    "afraid", "amused", "angry", "anxious", "ashamed", "bored", "confident",
    "confused", "contempt", "curious", "depressed", "determined", "disgusted",
    "ecstatic", "embarrassed", "enraged", "excited", "fear", "frightened", "frown",
    "frustrated", "guilty", "happy", "hopeful", "hurt", "indifferent", "jealous",
    "joyful", "miserable", "nervous", "neutral", "optimistic", "proud", "puzzled",
    "relieved", "sad", "scared", "shocked", "shy", "skeptical", "sleepy", "smile",
    "smug", "sorry", "stubborn", "surprised", "suspicious", "thoughtful", "tired",
    "withdrawn", "worried"
]

CAMERA_SHOTS = [
    "Aerial View", "Bird’s-Eye View", "Close-Up", "Cowboy Shot", "Dolly Zoom",
    "Dutch Angle", "Establishing Shot", "Extreme Close-Up", "Extreme Long Shot",
    "Full Shot", "Long Shot", "Medium Close-Up", "Medium Long Shot", "Medium Shot",
    "Over-the-Shoulder Shot", "Point-of-View Shot", "Two-Shot", "Fisheye Shot",
    "Worm's Eye", "Low-Angle Shot", "Macro Shot", "Tilt-Shift Shot", "Telephoto Shot"
]

In [4]:
class Scene(BaseModel):
    scene_number: int = Field(description="The sequential number of the scene")
    description: str = Field(description="A single-moment scene optimized for diffusion model sketch prompting")
    orientation: str = Field(description="Character orientation chosen from predefined list")
    expression: str = Field(description="Character facial expressions with names, e.g., 'John is worried'")
    setting: str = Field(description="Where is it happening? Time of day? Environmental details?")
    shot_type: str = Field(description="Camera shot chosen from predefined list")

class SceneList(BaseModel):
    scenes: List[Scene]

In [5]:
def extract_scenes(story_text, characters, num_scenes=5, style="sketch storyboard"):
    """
    Extracts structured storyboard scenes from a story using Meta's Llama-3 via Together AI.

    Parameters:
    - story_text (str): The full story provided by the user.
    - num_scenes (int): Desired number of storyboard scenes.
    - characters (dict): Dictionary where each key is a character's name and each value is their description.
    - style (str): The preferred visual style (default is "sketch storyboard").

    Returns:
    - List of structured storyboard scenes optimized for diffusion model prompting.
    """

    # Convert character descriptions into a structured reference
    character_details = "\n".join([f"- {name}: {desc}" for name, desc in characters.items()])

    messages = [
        {"role": "system", "content": "You are an AI specialized in creating optimized storyboard scene descriptions for diffusion models."},
        {"role": "user", "content": f"""
        The user has provided the following story:
        "{story_text}"

        ### **Storyboard Generation Instructions**
        - Expand the story logically to create a **continuous, structured visual narrative**.
        - Ensure **smooth transitions** between scenes, making the story feel like a real movie storyboard.
        - Divide the story into exactly {num_scenes} scenes, each capturing a **single distinct moment**.
        - Camera angles must be **carefully chosen to evoke emotion and storytelling impact**.

        ### **General Character Descriptions**
        These characters appear throughout the storyboard.  
        Their **appearance, clothing, and defining traits remain consistent**:  
        {character_details}

        ### **Scene Description Guidelines for Diffusion Model Prompting**
        - Each scene must depict **one specific moment**, not multiple actions.
        - The description should be **highly visual and structured for AI image generation**.
        - **Do not** repeat character descriptions in every scene—focus on their actions and positioning.
        - Integrate the following elements into the scene description:
          - **Action**: Describe the key action taking place.
          - **Characters**: Do **not** list them separately—integrate them into the expression field.
          - **Expression**: Instead of listing emotions separately, attach them to character names.  
            Example: `"John looks nervous"` instead of `"expression: nervous"`.
          - **Setting**: Where is this happening? Time of day? Important visual cues.
          - **Shot Type**: Specify the camera angle without including it in the text.
          - **Style**: This is a **'{style}'**, meant for **rough sketch-based storyboarding**.

        ### **Output Format (JSON)**
        Return a **list** of {num_scenes} structured scenes in JSON format. Each scene should have:
        - "scene_number": The scene index.
        - "description": A single-moment description optimized for sketch-based image generation.
        - "orientation": The character’s body position, chosen from predefined options.
        - "expression": A sentence describing the emotions of the characters, including their names.
        - "setting": Where is it happening? Time of day? Environmental details?
        - "shot_type": The camera shot, chosen from predefined options.
        """}
    ]

    # call Together AI's API with structured JSON list output
    response = together.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=messages,
        max_tokens=1500,
        temperature=0.7,
        response_format={"type": "json_object", "schema": SceneList.model_json_schema()}
    )

    # extract response text and parse it
    try:
        scenes = json.loads(response.choices[0].message.content)["scenes"]
    except json.JSONDecodeError:
        print("Error: LLM output could not be parsed as JSON.")
        scenes = []

    return scenes

In [6]:
sample_story = """
John enters the dark alley, his footsteps echoing between the damp brick walls.
Suddenly, a shadowy figure appears at the far end under the flickering streetlamp.
"""

characters = {
    "John": "A tall man with short brown hair, blue eyes, and a leather jacket.",
    "Shadowy Figure": "A mysterious person in a long cloak, face obscured by darkness."
}

In [7]:
scenes = extract_scenes(
    story_text=sample_story, 
    characters=characters, 
    num_scenes=3
)

print(json.dumps(scenes, indent=4))

[
    {
        "scene_number": 1,
        "description": "John enters the dark alley, his footsteps echoing between the damp brick walls.",
        "orientation": "medium shot",
        "expression": "John looks cautious",
        "setting": "a dimly lit alley at night, with a high ceiling and brick walls.",
        "shot_type": "over-the-shoulder"
    },
    {
        "scene_number": 2,
        "description": "John pauses, his eyes fixed on the flickering streetlamp as the shadowy figure steps into view.",
        "orientation": "medium shot",
        "expression": "John's expression turns tense as he looks at the shadowy figure",
        "setting": "the same alley, now lit by a single flickering streetlamp.",
        "shot_type": "two-shot"
    },
    {
        "scene_number": 3,
        "description": "The shadowy figure raises its hood, and John takes a step back, his eyes fixed on the figure.",
        "orientation": "medium shot",
        "expression": "John looks fearful as the

In [8]:
def _scene_to_prompt(scene, style="storyboard sketch"):
    """
    Converts a structured scene representation into a short, optimized text prompt for Stable Diffusion.

    Parameters:
    - scene (dict): A single scene's structured representation.
    - style (str): The desired artistic style (default is "storyboard sketch").

    Returns:
    - str: A structured text prompt optimized for Stable Diffusion.
    """

    # Extract necessary details
    description = scene["description"]
    orientation = scene["orientation"]
    expression = scene["expression"]
    setting = scene["setting"]
    shot_type = scene["shot_type"]

    # **Optimized, Concise Prompt**
    prompt = (
        f"{style}. "
        f"{description} "
        f"{expression}. "
        f"{orientation}, {shot_type}. "
        f"Setting: {setting}."
    )

    return prompt

In [9]:
def generate_all_prompts(scenes, style="rough storyboard sketch"):
    """
    Converts all scenes into structured diffusion model prompts.

    Parameters:
    - scenes (list): List of structured scenes.
    - style (str): The desired artistic style.

    Returns:
    - list: List of formatted text prompts.
    """
    prompts = [_scene_to_prompt(scene, style) for scene in scenes]
    return prompts


In [10]:
generated_prompts = generate_all_prompts(scenes)

# Print results
for i, prompt in enumerate(generated_prompts):
    print(f"Prompt {i+1}:")
    print(prompt)
    print("\n" + "="*80 + "\n")

Prompt 1:
rough storyboard sketch. John enters the dark alley, his footsteps echoing between the damp brick walls. John looks cautious. medium shot, over-the-shoulder. Setting: a dimly lit alley at night, with a high ceiling and brick walls..


Prompt 2:
rough storyboard sketch. John pauses, his eyes fixed on the flickering streetlamp as the shadowy figure steps into view. John's expression turns tense as he looks at the shadowy figure. medium shot, two-shot. Setting: the same alley, now lit by a single flickering streetlamp..


Prompt 3:
rough storyboard sketch. The shadowy figure raises its hood, and John takes a step back, his eyes fixed on the figure. John looks fearful as the shadowy figure raises its hood. medium shot, close-up. Setting: the same alley, with the streetlamp casting eerie shadows..




In [11]:
from diffusers import StableDiffusionPipeline
import torch

# Load Stable Diffusion model
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe.to("cuda")  # Move to GPU if available

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.32.1",
  "_name_or_path": "CompVis/stable-diffusion-v1-4",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [12]:
# Generate image from first prompt
image = pipe(generated_prompts[0]).images[0]

# Save image
image.save("scene1_storyboard.png")

  0%|          | 0/50 [00:00<?, ?it/s]

In [13]:
# Generate image from first prompt
image = pipe(generated_prompts[1]).images[0]

# Save image
image.save("scene2_storyboard.png")

  0%|          | 0/50 [00:00<?, ?it/s]

In [14]:
# Generate image from first prompt
image = pipe(generated_prompts[2]).images[0]

# Save image
image.save("scene3_storyboard.png")

  0%|          | 0/50 [00:00<?, ?it/s]