In [215]:
import dspy
import os
import json
import re
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Configure DSPy with Gemini model
# The key is read from the environment so it never has to appear in the notebook.
api_key = os.getenv("GEMINI_API_KEY", "")
if not api_key:
    # An empty key would otherwise only surface later, as an opaque failure on
    # the first LM call; flag it here where the cause is obvious.
    print("Warning: GEMINI_API_KEY is not set; calls to the Gemini model are likely to fail.")

In [228]:
# Build the LM client for the "gemini/gemini-2.5-flash" model with the key read
# from the environment, then hand it to DSPy via dspy.configure.
# NOTE(review): this cell's recorded execution count is later than the cells
# below it; on a fresh kernel it has to run before anything that calls the LM.
lm = dspy.LM("gemini/gemini-2.5-flash", api_key=api_key)
dspy.configure(lm=lm)

In [217]:
# Prompt contract for the location describer: two text inputs (location name and
# the scene text gathered for it) and one text output (the description).
# NOTE: DSPy uses a Signature's docstring as the task instructions for the LM,
# so the docstring below is runtime behavior rather than plain documentation --
# editing its wording (even to fix typos) changes what the model is asked to do.
class Descriptor(dspy.Signature):
    """
    Generate a detailed, structured description of a Star Wars (we only consider original trilogy) location/scene name using: `location_name` and `scene_action`.

    Rely on established canon knowledge about original trilogy instead, consider all location within context of original trilogy. 
    Use the provided scene action and dialogues to understand the context, atmosphere, and events that occur at this location.
    Do not invent unsupported facts. Leave output as '' if you cannot find any information.

    Output:
    Summary 3–6 sentences on the location’s role, history, and significance.
    Physical Description: key environmental or architectural traits.
    Narrative Function: how the location is used in stories or lore.
    Atmosphere: emotional tone and typical sentiments linked to the location.

    Go right into summary, no markdown formating, only plain text.
    Do not include "Summary" in the output.
    """

    # NOTE(review): the fields below pass `description=`; DSPy's documented
    # keyword for field hints is `desc` -- confirm the installed version
    # forwards `description` into the prompt.

    # Input: the location / scene name to describe.
    location_name: str = dspy.InputField(description="The name of the location to describe. This can also be a place or the scene name.")
    # Input: script text (actions and dialogue) collected for that location; may be empty.
    scene_action: str = dspy.InputField(description="Action happened at this location/scene to provide context and atmosphere.")
    # Output: the generated plain-text description.
    description: str = dspy.OutputField(description="A detailed, structured description of the location")

In [218]:
# Wrap the signature in a ChainOfThought module. The save step later in this
# notebook reads a `reasoning` attribute next to `description` on each result.
descriptor = dspy.ChainOfThought(Descriptor)

In [219]:
from pathlib import Path
from collections import defaultdict
from bs4 import BeautifulSoup

# Load location_characters.json to get all locations
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the results file in this notebook is written as UTF-8 too).
with open('data/location_characters.json', 'r', encoding='utf-8') as f:
    location_characters = json.load(f)

# Get all location keys
# Only the keys are used here: each key is one location to describe.
locations = list(location_characters.keys())
print(f"Found {len(locations)} locations to process")
print(f"First few locations: {locations[:5]}")

# Location normalization mapping to merge redundant location names
# Keys are matched by clean_location_name AFTER title-casing and the "'S" -> "'s"
# repair, so they must be written in that cleaned form; values are the canonical
# names that several variants collapse into.
# NOTE(review): the "Read Leader's ..." keys look like deliberate catches for a
# misspelling in the script text rather than typos in this table, and
# "Red Ten's Cockpit." keeps its trailing period for the same kind of reason --
# confirm against the scripts before "correcting" them.
LOCATION_NORMALIZATION = {
    "Darth Vader's Star Destroyer": "Vader's Star Destroyer",
    "Imperial Stardestroyer": "Imperial Star Destroyer",
    "Luke's X-Wing Fighter": "Luke's X-Wing",
    "Main Hangar Deck": "Main Hangar",
    "Red Ten's Cockpit.": "Red Ten's Cockpit",
    "Sail Barge Observation Deck": "Sail Barge",
    "Snowspeeder Cockpit": "Snowspeeder",
    "Stolen Imperial Shuttle": "Imperial Shuttle",
    "Tatooine Sea": "Tatooine",
    "Red Leader's X-Wing": "Red Leader Starship",
    "Red Leader's Cockpit": "Red Leader Starship",
    "Red Leader's Fighter": "Red Leader Starship",
    "Read Leader's Cockpit": "Red Leader Starship",
    "Read Leader's X-Wing Fighter": "Red Leader Starship",
}

def clean_location_name(location):
    """Reduce a raw scene heading to the location key used in this notebook.

    The heading loses, in order: a leading scene number, a leading INT/EXT
    marker, a trailing time-of-day tag and a trailing parenthetical. Only the
    text before the first " - " / " – " separator is kept. What remains is
    whitespace-collapsed, title-cased (with "'S" repaired to "'s") and finally
    looked up in LOCATION_NORMALIZATION.

    Args:
        location: Raw heading text, e.g. "3    INT DEATH STAR - MAIN DOCKING BAY".

    Returns:
        The cleaned location name, or None when the input is empty/None or
        nothing is left after cleaning.
    """
    if not location:
        return None

    # Scene number prefix (Return of the Jedi format: "3    INT DEATH STAR").
    name = re.sub(r"^[0-9]+\s+", "", location).strip()

    # INT/EXT marker, with or without the trailing period.
    name = re.sub(r"^(INT|EXT)[\.\s]+", "", name, flags=re.IGNORECASE).strip()

    # Trailing time-of-day tag.
    # NOTE(review): the class `[--]` matches only an ASCII hyphen, whereas the
    # separator split further down also accepts an en dash -- confirm intended.
    time_of_day_suffix = r"\s+[--]\s+(DAY|NIGHT|DAWN|DUSK|CONTINUOUS)$"
    name = re.sub(time_of_day_suffix, "", name, flags=re.IGNORECASE)

    # Trailing parenthetical detail.
    name = re.sub(r"\s*\([^)]*\)\s*$", "", name)

    # Sub-locations follow a dash; keep just the main location in front of it.
    if any(separator in name for separator in (" - ", " – ")):
        name = re.split(r"\s+[-–]\s+", name)[0]

    name = re.sub(r"\s+", " ", name).strip()

    # str.title() upper-cases the letter after an apostrophe ("Luke'S"); undo it.
    name = re.sub(r"'S\b", "'s", name.title())

    # Merge known redundant spellings into their canonical name.
    name = LOCATION_NORMALIZATION.get(name, name)

    return name or None

# Function to extract location dialogs and actions from HTML script files
def extract_location_content():
    """Collect the script text that appears under each location's scene headings.

    Reads the three trilogy script pages under data/html, walks the text of the
    <pre> inside <td class="scrtext"> line by line, and treats every run of
    non-blank lines between two scene headings as the content of the heading
    that opened it. Headings are reduced to location keys with
    clean_location_name. Missing files or unexpected page layouts are reported
    with a printed warning and skipped.

    Returns:
        dict mapping cleaned location name -> all content blocks found for that
        location, joined with a blank line between blocks.
    """
    data_dir = Path('data')
    script_names = (
        'Star-Wars-A-New-Hope.html',
        'Star-Wars-The-Empire-Strikes-Back.html',
        'Star-Wars-Return-of-the-Jedi.html',
    )
    html_files = [data_dir / 'html' / script_name for script_name in script_names]

    location_content_map = defaultdict(list)

    def is_scene_heading(text):
        # Format 1: "INT. LOCATION" or "EXT. LOCATION"
        # Format 2: "1    INT LOCATION" or "3    EXT LOCATION" (Return of the Jedi)
        # Format 3: "1    SPACE" (Return of the Jedi special)
        numbered = re.match(r"^[0-9]+\s+[A-Z]", text)
        return bool(numbered or re.match(r"^(INT|EXT)\.\s+", text))

    def store_scene(heading, body_lines):
        # File the finished scene under its cleaned location; scenes without a
        # heading, a usable name or any text are dropped.
        if not heading:
            return
        location_key = clean_location_name(heading)
        if not (location_key and body_lines):
            return
        scene_text = "\n".join(body_lines).strip()
        if scene_text:
            location_content_map[location_key].append(scene_text)

    for html_file in html_files:
        if not html_file.exists():
            print(f"Warning: {html_file} not found, skipping...")
            continue

        print(f"Processing {html_file.name}...")

        with open(html_file, 'r', encoding='utf-8', errors='replace') as handle:
            markup = handle.read()
        soup = BeautifulSoup(markup, 'html.parser')

        # Find the script table (same approach as scrape notebook)
        script_table = soup.find("td", class_="scrtext")
        if not script_table:
            print(f"Warning: Could not find scrtext table in {html_file.name}")
            continue

        pre_tag = script_table.find("pre")
        if not pre_tag:
            print(f"Warning: Could not find pre tag in {html_file.name}")
            continue

        open_heading = None
        open_body = []
        for raw_line in pre_tag.get_text().split("\n"):
            text = raw_line.strip()
            if is_scene_heading(text):
                # A new heading closes whatever scene was being collected.
                store_scene(open_heading, open_body)
                open_heading, open_body = text, []
            elif open_heading and text:
                # Text before the first heading of a file is ignored.
                open_body.append(text)

        # The last scene of a file has no following heading to close it.
        store_scene(open_heading, open_body)

    # A location can occur in many scenes; merge its blocks into one string.
    return {
        location_key: "\n\n".join(blocks) if blocks else ""
        for location_key, blocks in location_content_map.items()
    }

# Extract content for all locations
print("\nExtracting location content (dialogs + actions) from script files...")
location_dialogs = extract_location_content()
print(f"Extracted content for {len(location_dialogs)} locations")
print(f"Sample locations with content: {list(location_dialogs.keys())[:5]}")

# Show sample of content for first location
if location_dialogs:
    first_loc = next(iter(location_dialogs))
    full_content = location_dialogs[first_loc] or ""
    sample_content = full_content[:200]
    print(f"\nSample content for '{first_loc}':")
    # Decide on the ellipsis from the untruncated text: the preview itself is
    # never longer than 200 characters, so testing its length could never
    # signal that something was cut off.
    print(sample_content + "..." if len(full_content) > 200 else sample_content)

Found 94 locations to process
First few locations: ['Another Cockpit', 'Asteroid Cave', 'Barge Observation Deck', 'Battlefield', "Biggs' Cockpit"]

Extracting location content (dialogs + actions) from script files...
Processing Star-Wars-A-New-Hope.html...
Processing Star-Wars-The-Empire-Strikes-Back.html...
Processing Star-Wars-Return-of-the-Jedi.html...
Extracted content for 131 locations
Sample locations with content: ['Rebel Blockade Runner', 'Spacecraft In Space', 'Tatooine', 'Imperial Star Destroyer', 'Lifepod']

Sample content for 'Rebel Blockade Runner':
An explosion rocks the ship as two robots, Artoo-Detoo (R2-
D2) and See-Threepio (C-3PO) struggle to make their way
through the shaking, bouncing passageway. Both robots are
old and battered. Artoo is


In [220]:
def generate_desc(location_name: str, scene_dialogs: str = ""):
    """Run the module-level `descriptor` for one location.

    Args:
        location_name: Name of the location or scene to describe.
        scene_dialogs: Script text gathered for the location; it is passed to
            the descriptor under the signature's `scene_action` field name.
            Defaults to an empty string.

    Returns:
        Whatever `descriptor` returns for these inputs.
    """
    signature_inputs = {
        "location_name": location_name,
        "scene_action": scene_dialogs,
    }
    return descriptor(**signature_inputs)

In [221]:
# Create one example per location listed in location_characters.json
# Include dialogs and actions for each location
examples = []
# Locations whose name has no entry (or only empty text) in location_dialogs.
# They are still sent to the model, just without any scene context.
locations_without_content = []

for location_name in locations:
    # Get content (dialogs + actions) for this location (empty string if not found)
    scene_dialogs = location_dialogs.get(location_name, "")
    if not scene_dialogs:
        locations_without_content.append(location_name)

    # Create a dspy.Example with the inputs matching the Descriptor signature
    # Note: Descriptor uses 'scene_action' but we store it as 'scene_dialogs' for clarity
    example = dspy.Example(
        location_name=location_name,
        scene_action=scene_dialogs  # Map to scene_action to match Descriptor signature
    ).with_inputs('location_name', 'scene_action')
    examples.append(example)

print(f"Created {len(examples)} examples with location content")

# The lookup above falls back to "" silently, which hides name mismatches
# between the JSON keys and the names extracted from the scripts; surface them.
if locations_without_content:
    print(
        f"Warning: {len(locations_without_content)} of {len(locations)} locations "
        f"have no extracted scene content, e.g. {locations_without_content[:10]}"
    )

Created 94 examples with location content


In [222]:
from dspy import Parallel

def call_generate_desc(**kwargs):
    """Keyword-argument adapter around generate_desc.

    Accepts the signature's field names (`location_name`, `scene_action`) as
    keyword arguments and forwards them to generate_desc, whose second
    parameter is named `scene_dialogs`. Missing keys default to an empty
    string; any other keyword is ignored.
    """
    location_name = kwargs.get('location_name', '')
    # Map from scene_action to the scene_dialogs parameter.
    scene_action = kwargs.get('scene_action', '')
    return generate_desc(location_name=location_name, scene_dialogs=scene_action)


In [223]:
# One (callable, inputs) pair per example: the same wrapper is paired with each
# example's input fields.
exec_pairs = [
    (call_generate_desc, example.inputs()) 
    for example in examples
]

# Create Parallel executor
# NOTE(review): num_threads=100 is at least as large as the number of examples
# in the recorded run (94), so presumably every request can be in flight at
# once -- confirm this stays within the provider's rate limits.
# NOTE(review): the save step pairs results with `examples` by position;
# confirm that, with these settings, the executor returns exactly one result
# per pair and in input order.
parallel_executor = Parallel(
    num_threads=100,
    max_errors=None,
    access_examples=True,
    return_failed_examples=False,
    provide_traceback=True,
    disable_progress_bar=False
)


In [224]:
# Run every (callable, inputs) pair through the executor. This is the step that
# invokes the descriptor for all examples, so re-running it repeats that work.
parallel_results = parallel_executor(exec_pairs)

Processed 94 / 94 examples: 100%|██████████| 94/94 [00:35<00:00,  2.64it/s]


In [225]:
# Pair each example with its result (by position) and persist them as JSON.
# zip() stops at the shorter sequence, so a length mismatch would silently drop
# locations; make that visible instead.
if len(parallel_results) != len(examples):
    print(
        f"Warning: got {len(parallel_results)} results for {len(examples)} examples; "
        "only the first matching pairs will be saved"
    )

output_data = []
# Locations whose result carries no `description` attribute at all; they are
# still written (with empty strings) so the file keeps one row per location.
locations_without_description = []
for example, result in zip(examples, parallel_results):
    # result is the full prediction object from generate_desc
    # Access description and reasoning from the prediction object
    if not hasattr(result, 'description'):
        locations_without_description.append(example.location_name)
    output_data.append({
        'location_name': example.location_name,
        'description': getattr(result, 'description', ''),
        'reasoning': getattr(result, 'reasoning', '')
    })

output_file = 'data/location_descriptions_llm_withSceneAction_gemini_2_5_flash.json'
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"Saved {len(output_data)} location descriptions with reasoning to {output_file}")
if locations_without_description:
    print(
        f"Warning: {len(locations_without_description)} results had no description "
        f"and were saved empty, e.g. {locations_without_description[:10]}"
    )

Saved 94 location descriptions with reasoning to data/location_descriptions_llm_withSceneAction_gemini_2_5_flash.json


In [226]:
def price_token_cost(lm=None):
    """Print the total token usage and cost recorded in an LM's call history.

    Args:
        lm: Object exposing a `history` iterable of per-call records (dicts
            with optional 'cost' and 'usage' entries). When omitted, the LM
            currently configured in DSPy (`dspy.settings.lm`) is used. It is
            resolved at call time, so re-running the configuration cell is
            picked up instead of reporting a stale object's history.

    Raises:
        RuntimeError: If no `lm` is passed and none is configured in DSPy.
    """
    if lm is None:
        lm = dspy.settings.lm
        if lm is None:
            raise RuntimeError("No LM configured; pass one or call dspy.configure(lm=...) first.")

    history = lm.history
    # Records without a cost (missing or None) contribute nothing.
    cost = sum(entry['cost'] for entry in history if entry.get('cost') is not None)
    # Likewise skip records with no usage block or no total_tokens figure.
    total_tokens_used = sum(
        entry['usage']['total_tokens']
        for entry in history
        if entry.get('usage') and entry['usage'].get('total_tokens') is not None
    )
    print(f"Total tokens used: {total_tokens_used}")
    print(f"Total cost: ${cost:.6f}")

In [227]:
# Report cumulative token usage and cost; with no argument the function's
# default LM is the one inspected.
price_token_cost()

Total tokens used: 321642
Total cost: $0.412415
