In [1]:
SW_a_new_hope_script_url = "https://imsdb.com/scripts/Star-Wars-A-New-Hope.html"
SW_empire_strikes_back_script_url = (
    "https://imsdb.com/scripts/Star-Wars-The-Empire-Strikes-Back.html"
)
SW_return_of_the_jedi_script_url = (
    "https://imsdb.com/scripts/Star-Wars-Return-of-the-Jedi.html"
)

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import json
from collections import defaultdict
from pathlib import Path
import pandas as pd

# Set up data directory
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

In [4]:
def fetch_script(url):
    """Fetch and parse a Star Wars script from imsdb.com"""
    print(f"Fetching: {url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    return response.text


def extract_scenes_and_characters(html_content):
    """Extract scene headings and character names from script HTML"""
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the pre tag containing the script
    pre_tag = soup.find("pre")
    if not pre_tag:
        print("Warning: Could not find pre tag in HTML")
        return []

    script_text = pre_tag.get_text()
    lines = script_text.split("\n")

    scenes = []
    current_scene = {"location": None, "characters": set(), "content": []}

    for line in lines:
        line_stripped = line.strip()

        # Detect scene headings (INT./EXT. patterns)
        if re.match(r"^(INT|EXT)\.\s+", line_stripped):
            if current_scene["characters"]:
                scenes.append(current_scene)
            current_scene = {
                "location": line_stripped,
                "characters": set(),
                "content": [],
            }

        # Detect character names (all caps on their own line or before dialogue)
        elif (
            re.match(r"^[A-Z][A-Z\s\-\']*$", line_stripped)
            and len(line_stripped) > 1
            and len(line_stripped) < 50
        ):
            # This is likely a character name
            char_name = line_stripped.strip()
            if char_name and char_name not in [
                "FADE IN:",
                "FADE OUT:",
                "THE END",
                "CREDITS",
                "TO BE CONTINUED",
            ]:
                current_scene["characters"].add(char_name)
                current_scene["content"].append(("CHARACTER", char_name))
        else:
            if line_stripped:
                current_scene["content"].append(("DIALOGUE", line_stripped))

    # Don't forget the last scene
    if current_scene["characters"]:
        scenes.append(current_scene)

    return scenes


# Fetch all three scripts
scripts = {}
scripts["A New Hope"] = fetch_script(SW_a_new_hope_script_url)
scripts["The Empire Strikes Back"] = fetch_script(SW_empire_strikes_back_script_url)
scripts["Return of the Jedi"] = fetch_script(SW_return_of_the_jedi_script_url)

print("Scripts fetched successfully!")

# Extract scenes from each script
all_scenes = {}
for title, html in scripts.items():
    print(f"\nParsing {title}...")
    scenes = extract_scenes_and_characters(html)
    all_scenes[title] = scenes
    print(f"  Found {len(scenes)} scenes with character interactions")

    # Show sample
    if scenes:
        sample = scenes[0]
        print(f"  Sample scene: {sample['location']}")
        print(f"  Characters: {sample['characters']}")

Fetching: https://imsdb.com/scripts/Star-Wars-A-New-Hope.html
Fetching: https://imsdb.com/scripts/Star-Wars-The-Empire-Strikes-Back.html
Fetching: https://imsdb.com/scripts/Star-Wars-Return-of-the-Jedi.html
Scripts fetched successfully!

Parsing A New Hope...
  Found 259 scenes with character interactions
  Sample scene: None
  Characters: {'JOURNAL OF THE WHILLS', 'STAR WARS', 'A NEW HOPE'}

Parsing The Empire Strikes Back...
  Found 158 scenes with character interactions
  Sample scene: None
  Characters: {'THE EMPIRE STRIKES BACK'}

Parsing Return of the Jedi...
  Found 1 scenes with character interactions
  Sample scene: None
  Characters: {'MON MOTHMA', 'OPERATOR', 'Y-WING PILOT', 'YODA', 'JEDI', 'LEIA', 'ACKBAR', 'RED THREE', 'PILOT', 'LURE', 'NINEDENINE', 'RED TWO', 'CONTROL ROOM COMMANDER', 'BEN', 'SECOND COMMANDER', 'LUKE', 'PIETT', 'RETURN', 'JERJERROD', 'GREEN LEADER', 'GUARD', 'STORMTROOPER', 'ANAKIN', 'OOLA', 'EMPEROR', 'THREEPIO', 'BIB', 'HAN', 'NAVIGATOR', 'GRAY LEADER',

In [5]:
# Character name normalization mapping
# Maps all variations of character names to canonical names
CHARACTER_NORMALIZATION = {
    # Luke
    "LUKE": "Luke Skywalker",
    "LUKE SKYWALKER": "Luke Skywalker",
    "LUKE'S VOICE": "Luke Skywalker",
    # Han Solo
    "HAN": "Han Solo",
    "HAN SOLO": "Han Solo",
    "HAN'S VOICE": "Han Solo",
    # Leia
    "LEIA": "Leia Organa",
    "LEIA ORGANA": "Leia Organa",
    "LEIA'S VOICE": "Leia Organa",
    "PRINCESS LEIA": "Leia Organa",
    # Obi-Wan
    "OBI-WAN": "Obi-Wan Kenobi",
    "OBI WAN": "Obi-Wan Kenobi",
    "OBI-WAN KENOBI": "Obi-Wan Kenobi",
    "BEN": "Obi-Wan Kenobi",
    "OLD BEN": "Obi-Wan Kenobi",
    # Yoda
    "YODA": "Yoda",
    "YODA'S VOICE": "Yoda",
    # Darth Vader
    "VADER": "Darth Vader",
    "DARTH VADER": "Darth Vader",
    "LORD VADER": "Darth Vader",
    "VADER'S VOICE": "Darth Vader",
    # Emperor Palpatine
    "EMPEROR": "Emperor Palpatine",
    "PALPATINE": "Emperor Palpatine",
    "EMPEROR PALPATINE": "Emperor Palpatine",
    "THE EMPEROR": "Emperor Palpatine",
    "EMPEROR'S VOICE": "Emperor Palpatine",
    # Lando
    "LANDO": "Lando Calrissian",
    "LANDO CALRISSIAN": "Lando Calrissian",
    # Chewbacca
    "CHEWBACCA": "Chewbacca",
    "CHEWIE": "Chewbacca",
    "CHEW": "Chewbacca",
    # C-3PO and R2-D2
    "C-3PO": "C-3PO",
    "C3PO": "C-3PO",
    "THREEPIO": "C-3PO",
    "THREE-PO": "C-3PO",
    "R2-D2": "R2-D2",
    "R2D2": "R2-D2",
    "ARTOO": "R2-D2",
    # Other characters
    "LEIA'S HANDMAIDEN": "Leia's Handmaiden",
    "REBEL GENERAL": "Rebel General",
    "IMPERIAL OFFICER": "Imperial Officer",
    "STORMTROOPER": "Stormtrooper",
    "GUARD": "Guard",
    "OFFICER": "Officer",
    "PILOT": "Pilot",
    "REBEL PILOT": "Rebel Pilot",
    "X-WING PILOT": "X-Wing Pilot",
    # Other named characters
    "WEDGE": "Wedge Antilles",
    "WEDGE ANTILLES": "Wedge Antilles",
    "UNCLE OWEN": "Owen Lars",
    "OWEN": "Owen Lars",
    "AUNT BERU": "Beru Lars",
    "BERU": "Beru Lars",
    "BIGGS": "Biggs Darklighter",
    "BIGGS DARKLIGHTER": "Biggs Darklighter",
    "TARKIN": "Grand Moff Tarkin",
    "GRAND MOFF TARKIN": "Grand Moff Tarkin",
    "MOFF TARKIN": "Grand Moff Tarkin",
    "LEA": "Leia Organa",
    "SCENE": "Scene",
    "VOICE": "Voice",
}


def normalize_character_name(name):
    """Normalize a character name to canonical form"""
    if not name or not isinstance(name, str):
        return None

    name_upper = name.upper().strip()

    # Remove common suffixes
    name_upper = re.sub(r"\s*\(V\.O\.\)\s*$", "", name_upper)
    name_upper = re.sub(r"\s*\(CONT\'D\)\s*$", "", name_upper)
    name_upper = re.sub(r"\s*O\.S\.\s*$", "", name_upper)

    if name_upper in CHARACTER_NORMALIZATION:
        return CHARACTER_NORMALIZATION[name_upper]

    # If not found, clean up the name and return it
    if (
        name_upper
        and len(name_upper) > 2
        and name_upper not in ["THE", "AND", "OR", "TO"]
    ):
        return name_upper.title()

    return None


# Test the normalization
print("Testing character name normalization:")
test_names = ["LUKE", "Han", "leia", "OBI-WAN KENOBI", "VADER", "C-3PO"]
for name in test_names:
    print(f"  {name} → {normalize_character_name(name)}")

Testing character name normalization:
  LUKE → Luke Skywalker
  Han → Han Solo
  leia → Leia Organa
  OBI-WAN KENOBI → Obi-Wan Kenobi
  VADER → Darth Vader
  C-3PO → C-3PO


In [6]:
def clean_location_name(location):
    """Clean and normalize location names from scene headings"""
    if not location:
        return None

    # Remove INT/EXT prefix and time of day
    location = re.sub(r"^(INT|EXT)\.\s+", "", location).strip()
    location = re.sub(
        r"\s+[-–]\s+(DAY|NIGHT|DAWN|DUSK|CONTINUOUS)$",
        "",
        location,
        flags=re.IGNORECASE,
    )

    # Remove details in parentheses
    location = re.sub(r"\s*\([^)]*\)\s*$", "", location)

    # Clean up multiple dashes and spaces
    location = re.sub(r"\s+[-–]\s+", " - ", location)
    location = re.sub(r"\s+", " ", location).strip()

    # Convert to title case
    location = location.title()

    return location if location else None


# Test location cleaning
print("Testing location name cleaning:")
test_locations = [
    "INT. LUKE'S HOUSE - LIVING ROOM - DAY",
    "EXT. DEATH STAR - OUTER SPACE - CONTINUOUS",
    "INT. MILLENNIUM FALCON - COCKPIT (MOVING) - NIGHT",
]
for loc in test_locations:
    print(f"  {loc}")
    print(f"    → {clean_location_name(loc)}\n")

Testing location name cleaning:
  INT. LUKE'S HOUSE - LIVING ROOM - DAY
    → Luke'S House - Living Room

  EXT. DEATH STAR - OUTER SPACE - CONTINUOUS
    → Death Star - Outer Space

  INT. MILLENNIUM FALCON - COCKPIT (MOVING) - NIGHT
    → Millennium Falcon - Cockpit



In [7]:
# Process all scenes to build networks
character_connections = defaultdict(lambda: defaultdict(int))
location_characters = defaultdict(set)

total_scenes = 0
skipped_scenes = 0

for movie_title, scenes in all_scenes.items():
    print(f"\nProcessing {movie_title}...")

    for scene in scenes:
        total_scenes += 1
        location = clean_location_name(scene["location"])

        if not location:
            skipped_scenes += 1
            continue

        # Normalize all character names in this scene
        normalized_chars = set()
        for char in scene["characters"]:
            normalized = normalize_character_name(char)
            if normalized:
                normalized_chars.add(normalized)

        if len(normalized_chars) < 2:
            skipped_scenes += 1
            continue

        # Add characters to location
        for char in normalized_chars:
            location_characters[location].add(char)

        # Create connections between all pairs of characters in the scene
        char_list = list(normalized_chars)
        for i in range(len(char_list)):
            for j in range(i + 1, len(char_list)):
                char1 = char_list[i]
                char2 = char_list[j]

                # Store bidirectional connections
                character_connections[char1][char2] += 1
                character_connections[char2][char1] += 1

    print(f"  Processed {len(scenes)} scenes")
    print(
        f"  Found {len([c for s in scenes for c in s['characters']])} character mentions"
    )
    print(
        f"  Unique characters: {len(set([c for s in scenes for c in s['characters']]))}"
    )

print(f"\nTotal scenes: {total_scenes}")
print(f"Skipped scenes: {skipped_scenes}")
print(f"Unique characters in network: {len(character_connections)}")
print(f"Unique locations: {len(location_characters)}")

# Show summary statistics
print("\nTop 5 most connected characters:")
char_degrees = {
    char: len(connections) for char, connections in character_connections.items()
}
for char, degree in sorted(char_degrees.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"  {char}: {degree} connections")


Processing A New Hope...
  Processed 259 scenes
  Found 444 character mentions
  Unique characters: 76

Processing The Empire Strikes Back...
  Processed 158 scenes
  Found 314 character mentions
  Unique characters: 53

Processing Return of the Jedi...
  Processed 1 scenes
  Found 48 character mentions
  Unique characters: 48

Total scenes: 418
Skipped scenes: 224
Unique characters in network: 94
Unique locations: 123

Top 5 most connected characters:
  Luke Skywalker: 40 connections
  C-3PO: 34 connections
  Darth Vader: 30 connections
  Leia Organa: 28 connections
  Han Solo: 27 connections


In [8]:
# Convert to regular dicts for JSON serialization
character_connections_dict = {
    char: dict(connections) for char, connections in character_connections.items()
}

location_characters_dict = {
    location: sorted(list(characters))
    for location, characters in location_characters.items()
}

# Save character connections to JSON
char_connections_path = DATA_DIR / "character_connections.json"
with open(char_connections_path, "w") as f:
    json.dump(character_connections_dict, f, indent=2, sort_keys=True)

print(f"Saved character connections to {char_connections_path}")

# Save location-character connections to JSON
location_char_path = DATA_DIR / "location_characters.json"
with open(location_char_path, "w") as f:
    json.dump(location_characters_dict, f, indent=2, sort_keys=True)

print(f"Saved location-character data to {location_char_path}")

# Verify the files were created
print(
    f"\nCharacter connections file size: {char_connections_path.stat().st_size} bytes"
)
print(f"Location-character file size: {location_char_path.stat().st_size} bytes")

# Show sample of character connections
print("\nSample character connections (Luke Skywalker):")
if "Luke Skywalker" in character_connections_dict:
    luke_connections = character_connections_dict["Luke Skywalker"]
    for char, count in sorted(
        luke_connections.items(), key=lambda x: x[1], reverse=True
    )[:5]:
        print(f"  {char}: {count} scenes")
else:
    print("  Luke Skywalker not found in connections")

# Show sample of location data
print("\nSample locations:")
for location in sorted(location_characters_dict.keys())[:5]:
    chars = location_characters_dict[location]
    print(f"  {location}: {len(chars)} characters")

Saved character connections to data/character_connections.json
Saved location-character data to data/location_characters.json

Character connections file size: 12672 bytes
Location-character file size: 12504 bytes

Sample character connections (Luke Skywalker):
  Han Solo: 30 scenes
  Obi-Wan Kenobi: 21 scenes
  C-3PO: 19 scenes
  Leia Organa: 18 scenes
  Yoda: 5 scenes

Sample locations:
  Asteroid Cave - Millennium Falcon: 3 characters
  Asteroid Cave - Millennium Falcon - Cockpit: 3 characters
  Asteroid Cave - Millennium Falcon - Entry Area: 2 characters
  Asteroid Cave - Millennium Falcon - Hold Area: 3 characters
  Battlefield - Snow Trench: 2 characters
