In [11]:
from bs4 import BeautifulSoup
import re
import json
from collections import defaultdict
from pathlib import Path

# Set up data directory
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

In [12]:
# Setup scripts dictionary
scripts = {}
for movie in ["A New Hope", "The Empire Strikes Back", "Return of the Jedi"]:
    script_file = DATA_DIR / f"html/Star-Wars-{movie.replace(' ', '-')}.html"
    with open(script_file, "r", encoding="utf-8", errors="replace") as f:
        scripts[movie] = f.read()

In [13]:
def fetch_script_from_file(file_path):
    """Load a Star Wars script from local HTML file"""
    print(f"Loading: {file_path}")
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()


def extract_scenes_and_characters(html_content):
    """Extract scene headings and character names from script HTML"""
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the table with class="scrtext" which marks the start of the actual script
    script_table = soup.find("td", class_="scrtext")
    if not script_table:
        print("Warning: Could not find scrtext table in HTML")
        return []

    # Find the pre tag within the script table
    pre_tag = script_table.find("pre")
    if not pre_tag:
        print("Warning: Could not find pre tag in script table")
        return []

    script_text = pre_tag.get_text()
    lines = script_text.split("\n")

    scenes = []
    current_scene = {"location": None, "characters": set(), "content": []}
    scene_started = False  # Track if we've reached the actual script content

    for line in lines:
        line_stripped = line.strip()

        # Detect scene headings
        # Format 1: "INT. LOCATION" or "EXT. LOCATION"
        # Format 2 (Return of the Jedi): "1    INT LOCATION" or "3    EXT LOCATION"
        # Format 3 (Return of the Jedi special): "1    SPACE" or "19   HOLDING TUNNEL"
        is_scene_heading = False

        # Check for numbered scene format (any line starting with number + spaces + capital letter)
        # This catches both "3    INT DEATH STAR" and "1    SPACE"
        if re.match(r"^[0-9]+\s+[A-Z]", line_stripped):
            is_scene_heading = True
            scene_started = True
        # Check for standard format (e.g., "INT. DEATH STAR")
        elif re.match(r"^(INT|EXT)\.\s+", line_stripped):
            is_scene_heading = True
            scene_started = True

        if is_scene_heading:
            # Save previous scene if it has a location (even with 0 characters)
            if current_scene["location"] is not None:
                scenes.append(current_scene)

            current_scene = {
                "location": line_stripped,
                "characters": set(),
                "content": [],
            }
        # Only process character names after we've started seeing actual scenes
        elif (
            scene_started
            and re.match(r"^[A-Z][A-Z\s\-\']*$", line_stripped)
            and len(line_stripped) > 1
            and len(line_stripped) < 50
        ):
            # This is likely a character name
            char_name = line_stripped.strip()
            # Filter out common non-character all-caps text
            if char_name and char_name not in [
                "FADE IN:",
                "FADE OUT:",
                "THE END",
                "CREDITS",
                "TO BE CONTINUED",
                "CUT TO:",
                "DISSOLVE TO:",
                "CONTINUED:",
            ]:
                current_scene["characters"].add(char_name)
                current_scene["content"].append(("CHARACTER", char_name))
        else:
            if line_stripped and scene_started:
                current_scene["content"].append(("DIALOGUE", line_stripped))

    # Don't forget the last scene
    if current_scene["location"] is not None:
        scenes.append(current_scene)

    return scenes


# Load all three scripts from local files
scripts = {}
scripts["A New Hope"] = fetch_script_from_file(
    DATA_DIR / "html" / "Star-Wars-A-New-Hope.html"
)
scripts["The Empire Strikes Back"] = fetch_script_from_file(
    DATA_DIR / "html" / "Star-Wars-The-Empire-Strikes-Back.html"
)
scripts["Return of the Jedi"] = fetch_script_from_file(
    DATA_DIR / "html" / "Star-Wars-Return-of-the-Jedi.html"
)

print("Scripts loaded successfully!")

# Extract scenes from each script
all_scenes = {}
for title, html in scripts.items():
    print(f"\nParsing {title}...")
    scenes = extract_scenes_and_characters(html)
    all_scenes[title] = scenes
    print(f"  Found {len(scenes)} total scenes")
    scenes_with_chars = [s for s in scenes if s["characters"]]
    print(f"  Scenes with characters: {len(scenes_with_chars)}")

    # Show sample
    if scenes:
        sample = scenes[0]
        print(f"  Sample scene: {sample['location']}")
        print(
            f"  Characters: {list(sample['characters'])[:5] if sample['characters'] else 'None'}"
        )  # Show first 5 characters

Loading: data/html/Star-Wars-A-New-Hope.html
Loading: data/html/Star-Wars-The-Empire-Strikes-Back.html
Loading: data/html/Star-Wars-Return-of-the-Jedi.html
Scripts loaded successfully!

Parsing A New Hope...
  Found 479 total scenes
  Scenes with characters: 258
  Sample scene: INT. REBEL BLOCKADE RUNNER - MAIN PASSAGEWAY
  Characters: ['THREEPIO']

Parsing The Empire Strikes Back...
  Found 277 total scenes
  Scenes with characters: 157
  Sample scene: EXT. GALAXY - PLANET HOTH
  Characters: None

Parsing Return of the Jedi...
  Found 137 total scenes
  Scenes with characters: 91
  Sample scene: 1    SPACE
  Characters: ['RETURN OF THE JEDI']


In [19]:
# Character name normalization mapping
# Maps all variations of character names to canonical names
CHARACTER_NORMALIZATION = {
    # Luke
    "LUKE": "Luke Skywalker",
    "LUKE SKYWALKER": "Luke Skywalker",
    "LUKE'S VOICE": "Luke Skywalker",
    # Han Solo
    "HAN": "Han Solo",
    "HAN SOLO": "Han Solo",
    "HAN'S VOICE": "Han Solo",
    # Leia
    "LEIA": "Leia Organa",
    "LEIA ORGANA": "Leia Organa",
    "LEIA'S VOICE": "Leia Organa",
    "PRINCESS LEIA": "Leia Organa",
    # Obi-Wan
    "OBI-WAN": "Obi-Wan Kenobi",
    "OBI WAN": "Obi-Wan Kenobi",
    "OBI-WAN KENOBI": "Obi-Wan Kenobi",
    "BEN": "Obi-Wan Kenobi",
    "OLD BEN": "Obi-Wan Kenobi",
    "OBI-WAN'S VOICE": "Obi-Wan Kenobi",
    "Ben'S Voice": "Obi-Wan Kenobi",
    "Ben's Voice": "Obi-Wan Kenobi",
    # Yoda
    "YODA": "Yoda",
    "YODA'S VOICE": "Yoda",
    # Darth Vader
    "VADER": "Darth Vader",
    "DARTH VADER": "Darth Vader",
    "LORD VADER": "Darth Vader",
    "VADER'S VOICE": "Darth Vader",
    # Emperor Palpatine
    "EMPEROR": "Emperor Palpatine",
    "PALPATINE": "Emperor Palpatine",
    "EMPEROR PALPATINE": "Emperor Palpatine",
    "THE EMPEROR": "Emperor Palpatine",
    "EMPEROR'S VOICE": "Emperor Palpatine",
    # Lando
    "LANDO": "Lando Calrissian",
    "LANDO CALRISSIAN": "Lando Calrissian",
    # Chewbacca
    "CHEWBACCA": "Chewbacca",
    "CHEWIE": "Chewbacca",
    "CHEW": "Chewbacca",
    # C-3PO and R2-D2
    "C-3PO": "C-3PO",
    "C3PO": "C-3PO",
    "THREEPIO": "C-3PO",
    "THREE-PO": "C-3PO",
    "R2-D2": "R2-D2",
    "R2D2": "R2-D2",
    "ARTOO": "R2-D2",
    # Other characters
    "LEIA'S HANDMAIDEN": "Leia's Handmaiden",
    "REBEL GENERAL": "Rebel General",
    "IMPERIAL OFFICER": "Imperial Officer",
    "STORMTROOPER": "Stormtrooper",
    "GUARD": "Guard",
    "OFFICER": "Officer",
    "PILOT": "Pilot",
    "REBEL PILOT": "Rebel Pilot",
    "X-WING PILOT": "X-Wing Pilot",
    # Other named characters
    "WEDGE": "Wedge Antilles",
    "WEDGE ANTILLES": "Wedge Antilles",
    "UNCLE OWEN": "Owen Lars",
    "OWEN": "Owen Lars",
    "AUNT BERU": "Beru Lars",
    "BERU": "Beru Lars",
    "BIGGS": "Biggs Darklighter",
    "BIGGS DARKLIGHTER": "Biggs Darklighter",
    "TARKIN": "Grand Moff Tarkin",
    "GRAND MOFF TARKIN": "Grand Moff Tarkin",
    "MOFF TARKIN": "Grand Moff Tarkin",
    "LEA": "Leia Organa",
    "SCENE": "Scene",
    "VOICE": "Voice",
}

CHARACTER_EXCLUSIONS = [
    "AREA",
    "DAY",
    "END CREDITS OVER STAR FIELD",
    "FADE OUT",
]


def normalize_character_name(name: str):
    """Normalize a character name to canonical form"""
    if not name or not isinstance(name, str):
        return None

    name_upper = name.upper().strip()

    if name_upper in CHARACTER_EXCLUSIONS:
        return None

    # Remove common suffixes
    name_upper = re.sub(r"\s*\(V\.O\.\)\s*$", "", name_upper)
    name_upper = re.sub(r"\s*\(CONT\'D\)\s*$", "", name_upper)
    name_upper = re.sub(r"\s*O\.S\.\s*$", "", name_upper)

    if name_upper in CHARACTER_NORMALIZATION:
        return CHARACTER_NORMALIZATION[name_upper]

    # If not found, clean up the name and return it
    if (
        name_upper
        and len(name_upper) > 2
        and name_upper not in ["THE", "AND", "OR", "TO"]
    ):
        return name_upper.title()

    return None


# Test the normalization
print("Testing character name normalization:")
test_names = ["LUKE", "Han", "leia", "OBI-WAN KENOBI", "VADER", "C-3PO"]
for name in test_names:
    print(f"  {name} → {normalize_character_name(name)}")

Testing character name normalization:
  LUKE → Luke Skywalker
  Han → Han Solo
  leia → Leia Organa
  OBI-WAN KENOBI → Obi-Wan Kenobi
  VADER → Darth Vader
  C-3PO → C-3PO


In [20]:
# Location normalization mapping to merge redundant location names
LOCATION_NORMALIZATION = {
    "Darth Vader's Star Destroyer": "Vader's Star Destroyer",
    "Imperial Stardestroyer": "Imperial Star Destroyer",
    "Luke's X-Wing Fighter": "Luke's X-Wing",
    "Main Hangar Deck": "Main Hangar",
    "Red Ten's Cockpit.": "Red Ten's Cockpit",
    "Sail Barge Observation Deck": "Sail Barge",
    "Snowspeeder Cockpit": "Snowspeeder",
    "Stolen Imperial Shuttle": "Imperial Shuttle",
    "Tatooine Sea": "Tatooine",
    "Red Leader's X-Wing": "Red Leader Starship",
    "Red Leader's Cockpit": "Red Leader Starship",
    "Red Leader's Fighter": "Red Leader Starship",
    "Read Leader's Cockpit": "Red Leader Starship",
    "Read Leader's X-Wing Fighter": "Red Leader Starship",
}


def clean_location_name(location):
    """Clean and normalize location names from scene headings"""
    if not location:
        return None

    # Remove scene number prefix (Return of the Jedi format: "3    INT DEATH STAR")
    location = re.sub(r"^[0-9]+\s+", "", location).strip()

    # Remove INT/EXT prefix (if present) and time of day
    # Use optional match since some scenes like "SPACE" don't have INT/EXT
    location = re.sub(r"^(INT|EXT)[\.\s]+", "", location, flags=re.IGNORECASE).strip()
    location = re.sub(
        r"\s+[--]\s+(DAY|NIGHT|DAWN|DUSK|CONTINUOUS)$",
        "",
        location,
        flags=re.IGNORECASE,
    )

    # Remove details in parentheses
    location = re.sub(r"\s*\([^)]*\)\s*$", "", location)

    # Extract only the first part before the first dash to get the main location
    # e.g., "Asteroid Cave - Millennium Falcon - Cockpit" -> "Asteroid Cave"
    # e.g., "Death Star - Detention Area - Hallway" -> "Death Star"
    if " - " in location or " – " in location:
        location = re.split(r"\s+[-–]\s+", location)[0]

    # Clean up multiple spaces
    location = re.sub(r"\s+", " ", location).strip()

    # Convert to title case
    location = location.title()

    # Fix apostrophe capitalization issue (e.g., "Luke'S" -> "Luke's")
    location = re.sub(r"'S\b", "'s", location)

    # Apply location normalization mapping to merge redundant locations
    if location in LOCATION_NORMALIZATION:
        location = LOCATION_NORMALIZATION[location]

    return location if location else None


# Test location cleaning
print("Testing location name cleaning:")
test_locations = [
    "INT. LUKE'S HOUSE - LIVING ROOM - DAY",
    "EXT. DEATH STAR - OUTER SPACE - CONTINUOUS",
    "INT. MILLENNIUM FALCON - COCKPIT (MOVING) - NIGHT",
    "INT. ASTEROID CAVE - MILLENNIUM FALCON - COCKPIT",
    "INT. DEATH STAR - DETENTION AREA - HALLWAY",
    "INT. VADER'S STAR DESTROYER",
    "INT. DARTH VADER'S STAR DESTROYER",
    "EXT. IMPERIAL STAR DESTROYER",
    "EXT. IMPERIAL STARDESTROYER",
    "INT. LUKE'S X-WING",
    "INT. LUKE'S X-WING FIGHTER",
    "INT. MAIN HANGAR",
    "INT. MAIN HANGAR DECK",
    "1    SPACE",
    "19   HOLDING TUNNEL - RANCOR PIT",
    "3    INT DEATH STAR - CONTROL ROOM",
    "10   EXT TATOOINE - DESERT",
]
for loc in test_locations:
    print(f"  {loc}")
    print(f"    → {clean_location_name(loc)}\n")

Testing location name cleaning:
  INT. LUKE'S HOUSE - LIVING ROOM - DAY
    → Luke's House

  EXT. DEATH STAR - OUTER SPACE - CONTINUOUS
    → Death Star

  INT. MILLENNIUM FALCON - COCKPIT (MOVING) - NIGHT
    → Millennium Falcon

  INT. ASTEROID CAVE - MILLENNIUM FALCON - COCKPIT
    → Asteroid Cave

  INT. DEATH STAR - DETENTION AREA - HALLWAY
    → Death Star

  INT. VADER'S STAR DESTROYER
    → Vader's Star Destroyer

  INT. DARTH VADER'S STAR DESTROYER
    → Vader's Star Destroyer

  EXT. IMPERIAL STAR DESTROYER
    → Imperial Star Destroyer

  EXT. IMPERIAL STARDESTROYER
    → Imperial Star Destroyer

  INT. LUKE'S X-WING
    → Luke's X-Wing

  INT. LUKE'S X-WING FIGHTER
    → Luke's X-Wing

  INT. MAIN HANGAR
    → Main Hangar

  INT. MAIN HANGAR DECK
    → Main Hangar

  1    SPACE
    → Space

  19   HOLDING TUNNEL - RANCOR PIT
    → Holding Tunnel

  3    INT DEATH STAR - CONTROL ROOM
    → Death Star

  10   EXT TATOOINE - DESERT
    → Tatooine



In [21]:
# Process all scenes to build networks
character_connections = defaultdict(lambda: defaultdict(int))
location_characters = defaultdict(set)

total_scenes = 0
skipped_scenes = 0
scenes_for_char_network = 0

for movie_title, scenes in all_scenes.items():
    print(f"\nProcessing {movie_title}...")

    for scene in scenes:
        total_scenes += 1
        location = clean_location_name(scene["location"])

        if not location:
            skipped_scenes += 1
            continue

        # Normalize all character names in this scene
        normalized_chars = set()
        for char in scene["characters"]:
            normalized = normalize_character_name(char)
            if normalized:
                normalized_chars.add(normalized)

        # Skip scenes with no characters
        if len(normalized_chars) == 0:
            skipped_scenes += 1
            continue

        # Add characters to location (even if only 1 character)
        for char in normalized_chars:
            location_characters[location].add(char)

        # Create character-to-character connections only if 2+ characters
        if len(normalized_chars) >= 2:
            scenes_for_char_network += 1
            char_list = list(normalized_chars)
            for i in range(len(char_list)):
                for j in range(i + 1, len(char_list)):
                    char1 = char_list[i]
                    char2 = char_list[j]

                    # Store bidirectional connections
                    character_connections[char1][char2] += 1
                    character_connections[char2][char1] += 1

    print(f"  Processed {len(scenes)} scenes")
    print(
        f"  Found {len([c for s in scenes for c in s['characters']])} character mentions"
    )
    print(
        f"  Unique characters: {len(set([c for s in scenes for c in s['characters']]))}"
    )

print(f"\nTotal scenes extracted: {total_scenes}")
print(f"Scenes with no location or characters (skipped): {skipped_scenes}")
print(f"Scenes used for character network (2+ characters): {scenes_for_char_network}")
print(
    f"Scenes used for location mapping (1+ characters): {total_scenes - skipped_scenes}"
)
print(f"Unique characters in network: {len(character_connections)}")
print(f"Unique locations: {len(location_characters)}")

# Show summary statistics
print("\nTop 5 most connected characters:")
char_degrees = {
    char: len(connections) for char, connections in character_connections.items()
}
for char, degree in sorted(char_degrees.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"  {char}: {degree} connections")


Processing A New Hope...
  Processed 479 scenes
  Found 441 character mentions
  Unique characters: 73

Processing The Empire Strikes Back...
  Processed 277 scenes
  Found 313 character mentions
  Unique characters: 52

Processing Return of the Jedi...
  Processed 137 scenes
  Found 213 character mentions
  Unique characters: 44

Total scenes extracted: 893
Scenes with no location or characters (skipped): 388
Scenes used for character network (2+ characters): 258
Scenes used for location mapping (1+ characters): 505
Unique characters in network: 114
Unique locations: 94

Top 5 most connected characters:
  Luke Skywalker: 46 connections
  C-3PO: 45 connections
  Leia Organa: 34 connections
  Darth Vader: 33 connections
  Han Solo: 33 connections


In [22]:
# Convert to regular dicts for JSON serialization
character_connections_dict = {
    char: dict(connections) for char, connections in character_connections.items()
}

location_characters_dict = {
    location: sorted(list(characters))
    for location, characters in location_characters.items()
}

# Save character connections to JSON
char_connections_path = DATA_DIR / "character_connections.json"
with open(char_connections_path, "w") as f:
    json.dump(character_connections_dict, f, indent=2, sort_keys=True)

print(f"Saved character connections to {char_connections_path}")

# Save location-character connections to JSON
location_char_path = DATA_DIR / "location_characters.json"
with open(location_char_path, "w") as f:
    json.dump(location_characters_dict, f, indent=2, sort_keys=True)

print(f"Saved location-character data to {location_char_path}")

# Verify the files were created
print(
    f"\nCharacter connections file size: {char_connections_path.stat().st_size} bytes"
)
print(f"Location-character file size: {location_char_path.stat().st_size} bytes")

# Show sample of character connections
print("\nSample character connections (Luke Skywalker):")
if "Luke Skywalker" in character_connections_dict:
    luke_connections = character_connections_dict["Luke Skywalker"]
    for char, count in sorted(
        luke_connections.items(), key=lambda x: x[1], reverse=True
    )[:5]:
        print(f"  {char}: {count} scenes")
else:
    print("  Luke Skywalker not found in connections")

# Show sample of location data
print("\nSample locations:")
for location in sorted(location_characters_dict.keys())[:5]:
    chars = location_characters_dict[location]
    print(f"  {location}: {len(chars)} characters")

Saved character connections to data/character_connections.json
Saved location-character data to data/location_characters.json

Character connections file size: 16241 bytes
Location-character file size: 9549 bytes

Sample character connections (Luke Skywalker):
  Han Solo: 43 scenes
  C-3PO: 29 scenes
  Leia Organa: 26 scenes
  Obi-Wan Kenobi: 22 scenes
  Darth Vader: 7 scenes

Sample locations:
  Another Cockpit: 2 characters
  Asteroid Cave: 3 characters
  Barge Observation Deck: 1 characters
  Battlefield: 2 characters
  Biggs' Cockpit: 3 characters
