In [1]:
from dotenv import load_dotenv
import json
from together import Together
from pydantic import BaseModel, Field
from typing import List, Dict

In [2]:
load_dotenv()
together = Together() # add .env file with TOGETHER_API_KEY variable

In [3]:
ORIENTATIONS = [
    "Front View", "Profile View", "Back View", "From Behind", "From Above",
    "From Below", "Three-Quarters View", "Long Shot", "Three-Quarters Rear View"
]

# EXPRESSIONS = [
#     "afraid", "amused", "angry", "anxious", "ashamed", "bored", "confident",
#     "confused", "contempt", "curious", "depressed", "determined", "disgusted",
#     "ecstatic", "embarrassed", "enraged", "excited", "fear", "frightened", "frown",
#     "frustrated", "guilty", "happy", "hopeful", "hurt", "indifferent", "jealous",
#     "joyful", "miserable", "nervous", "neutral", "optimistic", "proud", "puzzled",
#     "relieved", "sad", "scared", "shocked", "shy", "skeptical", "sleepy", "smile",
#     "smug", "sorry", "stubborn", "surprised", "suspicious", "thoughtful", "tired",
#     "withdrawn", "worried"
# ]

CAMERA_SHOTS = [
    "Aerial View", "Bird’s-Eye View", "Close-Up", "Cowboy Shot", "Dolly Zoom",
    "Dutch Angle", "Establishing Shot", "Extreme Close-Up", "Extreme Long Shot",
    "Full Shot", "Long Shot", "Medium Close-Up", "Medium Long Shot", "Medium Shot",
    "Over-the-Shoulder Shot", "Point-of-View Shot", "Two-Shot", "Fisheye Shot",
    "Worm's Eye", "Low-Angle Shot", "Macro Shot", "Tilt-Shift Shot", "Telephoto Shot"
]

In [4]:
# class Scene(BaseModel):
#     scene_number: int = Field(description="The sequential number of the scene")
#     description: str = Field(description="A single-moment scene optimized for diffusion model sketch prompting")
#     orientation: str = Field(description="Character orientation chosen from predefined list")
#     expression: str = Field(description="Character facial expressions with names, e.g., 'John is worried'")
#     setting: str = Field(description="Where is it happening? Time of day? Environmental details?")
#     shot_type: str = Field(description="Camera shot chosen from predefined list")
#     character_positions: str = Field(description="Where are the characters in the scene?")

# class SceneList(BaseModel):
#     scenes: List[Scene]

class CharacterInScene(BaseModel):
    name: str
    description: str
    action: str
    expression: str

class Scene(BaseModel):
    scene_number: int
    description: str
    shot_type: str
    orientation: str
    characters: List[CharacterInScene]
    character_positions: str

class SceneList(BaseModel):
    scenes: List[Scene]

In [5]:
def build_character_description(char: Dict[str, str]) -> str:
    """Generates a concise physical description from a character dictionary."""
    features = [
        char.get("ethnicity", ""),
        char.get("age", ""),
        char.get("gender", ""),
        char.get("hair", ""),
        char.get("facial_hair", ""),
        char.get("body_type", ""),
        f"wearing {char.get('clothing', '')}",
        f"with {char.get('accessories', '')}" if char.get("accessories") else ""
    ]
    return ", ".join(filter(None, features))

In [None]:
def extract_scenes(story_text: str, characters: Dict[str, Dict], num_scenes=5, style="sketch storyboard") -> List[Scene]:
    """Extracts structured storyboard scenes optimized for diffusion model prompting."""
    
    character_descriptions = {name: build_character_description(desc) for name, desc in characters.items()}

    messages = [
        {"role": "system", "content": "You are an AI specialized in structured storyboard scene generation for diffusion models."},
        {"role": "user", "content": f'''
The following story needs to be split into {num_scenes} structured storyboard scenes.

### **Story:**
"{story_text}"

### **Character Descriptions:**
These characters appear in the storyboard with **consistent appearance**:
{json.dumps(character_descriptions, indent=2)}

### **Scene Format & Requirements**
- Each scene represents a **single distinct moment** (no multiple actions).
- **Characters are placed in the scene** with relative positions: (foreground, background, left, right, etc.).
- **Shot Type & Orientation** must be defined.
- **Each character's description is referenced**, but avoid full repetition.
- Expressions & actions must be directly tied to the characters.

### **Output Format (JSON)**
Each scene should include:
1. `"scene_number"`: Sequential index.
2. `"description"`: A single-moment, vivid summary.
3. `"shot_type"`: Camera shot from {CAMERA_SHOTS}.
4. `"orientation"`: Character orientation from {ORIENTATIONS}.
5. `"characters"`: A list of objects, each with:
   - `"name"`: Character name
   - `"description"`: Shortened version from provided character descriptions
   - `"action"`: What the character is actively doing
   - `"expression"`: Emotional state
6. `"character_positions"`: A sentence explaining where each character is in the frame.

Return **valid JSON** with `"scenes"` as a list.
'''}
    ]

    response = together.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=messages,
        max_tokens=10000,
        temperature=0.7,
        response_format={"type": "json_object", "schema": SceneList.model_json_schema()}
    )

    try:
        output_json = response.choices[0].message.content
        return json.loads(output_json)["scenes"]  # Extract only the scenes list
    except (json.JSONDecodeError, KeyError) as e:
        print("Error: Could not parse JSON output", e)
        return []

In [19]:
story_text = """
Late in the evening, Don Vito Corleone sits quietly in his office, the dim glow of a single lamp illuminating
his desk. Tom Hagen enters with an urgent message about a family matter, his brow furrowed with concern.
They exchange a few words in hushed tones, aware of how each stray sound can echo in the silent corridors.

In the adjacent parlor, Johnny Fontane, fresh from a performance, sips whiskey while contemplating
his next move. His gaze shifts between the gilded clock on the wall and the doorway leading to Don Vito's office.
Outside, the autumn wind rustles through the garden, carrying with it the faint scent of oranges.

Don Vito stands to greet Johnny, inviting him to share what's on his mind. Tom Hagen steps aside, quietly
observing their exchange, ready to step in with legal advice if needed. The tension in the air is palpable
as old loyalties and new opportunities hang in the balance. 
"""

characters = {
    "Don Vito Corleone": {
        "age": "early 60s", "gender": "male", "hair": "slicked-back gray-black hair",
        "clothing": "dark three-piece suit",
        "body_type": "stocky, slightly hunched posture",
        "accessories": "gold ring on right hand, pocket watch",
        "ethnicity": "Italian-American"
    },
    "Tom Hagen": {
        "age": "early 40s", "gender": "male", "hair": "short, neatly combed brown hair",
        "facial_hair": "clean-shaven", "clothing": "gray suit, dark tie",
        "body_type": "medium build, upright posture", "ethnicity": "German-Irish"
    },
    "Johnny Fontane": {
        "age": "late 30s", "gender": "male", "hair": "short, slicked-back black hair",
        "facial_hair": "clean shaven", "clothing": "dark, stylish suit with an open collar",
        "body_type": "slim and fit", "accessories": "gold ring, cigarette"
    }
}

In [20]:
scenes = extract_scenes(story_text, characters, num_scenes=5)
print(json.dumps(scenes, indent=4))

[
    {
        "scene_number": 1,
        "description": "Don Vito sits quietly in his office, awaiting a message.",
        "shot_type": "Establishing Shot",
        "orientation": "Front View",
        "characters": [
            {
                "name": "Don Vito Corleone",
                "description": "Italian-American, early 60s, male",
                "action": "sitting",
                "expression": "serious"
            },
            {
                "name": "Tom Hagen",
                "description": "German-Irish, early 40s, male",
                "action": "entering",
                "expression": "concerned"
            }
        ],
        "character_positions": "Don Vito is in the foreground, and Tom Hagen is in the background."
    },
    {
        "scene_number": 2,
        "description": "Johnny Fontane sips whiskey in the parlor, contemplating his next move.",
        "shot_type": "Medium Close-Up",
        "orientation": "Three-Quarters View",
        "charact

In [None]:
def scene_to_prompt(scene, style="rough storyboard sketch"):
    """
    Converts a structured scene JSON into an optimized prompt for Stable Diffusion.

    Parameters:
    - scene (dict): A single scene's structured representation.
    - style (str): The desired artistic style (default is "rough storyboard sketch").

    Returns:
    - str: A structured text prompt optimized for image generation.
    """

    # Extract core elements
    description = scene["description"]
    shot_type = scene["shot_type"]
    orientation = scene["orientation"]
    character_positions = scene["character_positions"]

    # Build character descriptions with actions & expressions
    character_details = []
    for char in scene["characters"]:
        char_text = (
            f"{char['description']}, {char['action']}, "
            f"{char['expression']} expression,"
        )
        character_details.append(char_text)
    
    character_text = " and ".join(character_details)  # Separate multiple characters with "|"

    # Construct final structured prompt
    prompt = (
        f"{style}. With {character_text}."
        f"{description}"
        f"{shot_type}, {orientation}. "
        f"{character_positions}. "
    )

    return prompt

def generate_all_prompts(scenes, style="rough storyboard sketch"):
    """
    Converts all scenes into structured diffusion model prompts.

    Parameters:
    - scenes (list): List of structured scenes.
    - style (str): The desired artistic style.

    Returns:
    - list: List of formatted text prompts.
    """
    return [scene_to_prompt(scene, style) for scene in scenes]

print(scene_to_prompt(scenes[0]))

rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With Italian-American, early 60s, male, sitting, serious expression, and German-Irish, early 40s, male, entering, concerned expression,.Don Vito sits quietly in his office, awaiting a message.Establishing Shot, Front View. Positioning: Don Vito is in the foreground, and Tom Hagen is in the background.. 


In [22]:
# Convert all scenes and print
prompts = generate_all_prompts(scenes)
for i, prompt in enumerate(prompts):
    print(f"Scene {i+1} Prompt:\n{prompt}\n")

Scene 1 Prompt:
rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With Italian-American, early 60s, male, sitting, serious expression, and German-Irish, early 40s, male, entering, concerned expression,.Don Vito sits quietly in his office, awaiting a message.Establishing Shot, Front View. Positioning: Don Vito is in the foreground, and Tom Hagen is in the background.. 

Scene 2 Prompt:
rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With late 30s, male, sipping whiskey, contemplative expression, and German-Irish, early 40s, male, observing, neutral expression,.Johnny Fontane sips whiskey in the parlor, contemplating his next move.Medium Close-Up, Three-Quarters View. Positioning: Johnny Fontane is in the foreground, with Tom Hagen in the background, observing from across the room.. 

Scene 3 Prompt:
rough storyboard sketch, simple sketch lines, minimal shading, rough hatching, draft-style. With Italian

In [11]:
from diffusers import StableDiffusionPipeline
# from diffusers import DiffusionPipeline
import torch

# Load Stable Diffusion model
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
# pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16)
# pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", torch_dtype=torch.float16)
pipe.to("cuda")  # Move to GPU if available

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.32.1",
  "_name_or_path": "CompVis/stable-diffusion-v1-4",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [12]:
negative_prompt = "photorealistic, 3d render, overly detailed, digital art, painting, vibrant colors, fine art, NSFW"

In [23]:
# Generate image from first prompt
image = pipe(prompts[0], negative_prompt=negative_prompt).images[0]

# Save image
image.save("scene1_storyboard.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['vito is in the foreground , and tom hagen is in the background ..']


  0%|          | 0/50 [00:00<?, ?it/s]

In [24]:
# Generate image from first prompt
image = pipe(prompts[1], negative_prompt=negative_prompt).images[0]
# Save image
image.save("scene2_storyboard.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['view . positioning : johnny fontane is in the foreground , with tom hagen in the background , observing from across the room ..']


  0%|          | 0/50 [00:00<?, ?it/s]

In [25]:
# Generate image from first prompt
image = pipe(prompts[2], negative_prompt=negative_prompt).images[0]
# Save image
image.save("scene3_storyboard.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [': don vito is in the foreground , with johnny fontane behind him , looking over his shoulder ..']


  0%|          | 0/50 [00:00<?, ?it/s]

In [26]:
# Generate image from first prompt
image = pipe(prompts[3], negative_prompt=negative_prompt).images[0]
# Save image
image.save("scene4_storyboard.png")

  0%|          | 0/50 [00:00<?, ?it/s]

In [27]:
# Generate image from first prompt
image = pipe(prompts[4], negative_prompt=negative_prompt).images[0]
# Save image
image.save("scene5_storyboard.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['johnny are standing in the foreground , with tom hagen in the background , observing the scene ..']


  0%|          | 0/50 [00:00<?, ?it/s]