In [140]:
import dspy
import os
from dotenv import load_dotenv

# Load settings from a .env file into the environment before the key is read.
load_dotenv()

# Gemini credential passed to dspy.LM below; an unset variable yields "".
api_key = os.environ.get("GEMINI_API_KEY", "")

In [141]:
# Build the Gemini 2.5 Flash client with the key read above. cache=False is
# passed explicitly (presumably so repeated runs issue fresh requests instead of
# reusing cached responses — confirm against the dspy.LM documentation).
lm = dspy.LM("gemini/gemini-2.5-flash", api_key=api_key, cache=False)
# Register this client as the LM in DSPy's global configuration.
dspy.configure(lm=lm)

In [142]:
# NOTE: DSPy uses a signature's docstring as the task instructions sent to the
# model, so the text below is prompt content rather than plain documentation.
# Edit it with the same care as code.
class Descriptor(dspy.Signature):
    """
    Generate a detailed, structured description of a Star Wars (we only consider the original trilogy) location/scene name using: `location_name`

    Rely on established canon knowledge about the original trilogy; consider all locations within the context of the original trilogy.
    Do not invent unsupported facts. Leave output as '' if you cannot find any information.

    Output:
    Summary 3–6 sentences on the location’s role, history, and significance.
    Physical Description: key environmental or architectural traits.
    Narrative Function: how the location is used in stories or lore.
    Atmosphere: emotional tone and typical sentiments linked to the location.

    Go right into summary, no markdown formatting, only plain text.
    Do not include "Summary" in the output.
    """

    # Sole input: the location / place / scene name to describe.
    location_name: str = dspy.InputField(description="The name of the location to describe. This can also be a place or the scene name.")
    # Sole output: the plain-text description produced by the model.
    description: str = dspy.OutputField(description="A detailed, structured description of the location")

In [143]:
# Wrap the Descriptor signature in DSPy's ChainOfThought module; generate_desc
# calls this predictor once per location.
descriptor = dspy.ChainOfThought(Descriptor)

In [144]:
def generate_desc(location_name: str, wiki_page: str):
    """Return the generated description for a single location.

    Args:
        location_name: Location or scene name forwarded to the predictor.
        wiki_page: Accepted so callers can supply it, but NOT forwarded to the
            predictor; only ``location_name`` reaches the model.

    Returns:
        The ``description`` attribute of the predictor's result.
    """
    prediction = descriptor(location_name=location_name)
    return prediction.description

In [145]:
import json

# Load location_characters.json. The encoding is stated explicitly, matching the
# other JSON reads/writes in this notebook, so non-ASCII text decodes the same
# way regardless of the platform's default encoding.
with open('data/location_characters.json', 'r', encoding='utf-8') as f:
    location_characters = json.load(f)

# The top-level keys of the loaded mapping are the location names to process.
locations = list(location_characters.keys())
print(f"Found {len(locations)} locations to process")
print(f"First few locations: {locations[:5]}")


Found 94 locations to process
First few locations: ['Another Cockpit', 'Asteroid Cave', 'Barge Observation Deck', 'Battlefield', "Biggs' Cockpit"]


In [146]:
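# NOTE: disabled one-off Wikipedia scraper, kept for reference. The entry point at
# the bottom of this cell writes wikipedia_starwars_only_demo.json, which the next
# cell loads; uncomment the whole cell to regenerate that file.
# import json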
# import re
# import time
# from typing import Any, Dict, List, Optional, Tuple

# import requests

# API = "https://en.wikipedia.org/w/api.php"

# NAMES = [
#     # ... put your list here ...
# ]

# STARWARS_HINTS = [
#     "star wars",
#     "galactic",
#     "jedi",
#     "sith",
#     "luke",
#     "leia",
#     "han solo",
#     "tatooine",
#     "endor",
#     "hoth",
#     "dagobah",
#     "death star",
# ]


# def wiki_get(session: requests.Session, params: Dict[str, Any], timeout_s: int = 30) -> Dict[str, Any]:
#     r = session.get(API, params=params, timeout=timeout_s)
#     r.raise_for_status()
#     return r.json()


# def search(session: requests.Session, srsearch: str, limit: int = 5) -> List[Dict[str, Any]]:
#     data = wiki_get(
#         session,
#         {
#             "action": "query",
#             "list": "search",
#             "srsearch": srsearch,
#             "srlimit": limit,
#             "srnamespace": 0,
#             "format": "json",
#             "formatversion": 2,
#         },
#     )
#     return data.get("query", {}).get("search", []) or []


# def score_hit(name: str, hit: Dict[str, Any]) -> float:
#     """
#     Score a search hit for being Star Wars-specific and matching the name.
#     Uses title + snippet heuristics.
#     """
#     title = (hit.get("title") or "").lower()
#     snippet = re.sub(r"<.*?>", " ", (hit.get("snippet") or "")).lower()
#     text = f"{title} {snippet}"

#     s = 0.0
#     name_l = name.lower()

#     # Name closeness
#     if title == name_l:
#         s += 4.0
#     if name_l in title:
#         s += 2.5
#     if name_l in snippet:
#         s += 1.0

#     # Star Wars signals
#     if "star wars" in text:
#         s += 3.0
#     for kw in STARWARS_HINTS:
#         if kw in text:
#             s += 0.4

#     # Penalize obvious non-Star-Wars ambiguity pages
#     if "(disambiguation)" in title:
#         s -= 2.0

#     # Wikipedia's own search rank (lower index is better) isn't provided directly,
#     # but we can lightly trust 'wordcount' as a weak proxy for "real page".
#     wc = hit.get("wordcount") or 0
#     s += min(1.0, wc / 2000.0)  # cap

#     return s


# def pick_best_starwars_title(session: requests.Session, name: str) -> Optional[str]:
#     # Query variants (most targeted first)
#     queries = [
#         f"\"{name}\" Star Wars",
#         f"intitle:\"{name}\" Star Wars",
#         f"incategory:\"Star Wars\" \"{name}\"",
#         f"incategory:\"Star Wars\" {name}",
#         f"{name} Star Wars",
#     ]

#     best: Tuple[float, Optional[str]] = (-1e9, None)

#     for q in queries:
#         hits = search(session, q, limit=8)
#         for h in hits:
#             sc = score_hit(name, h)
#             if sc > best[0]:
#                 best = (sc, h.get("title"))
#         # early exit if we found a very strong match
#         if best[0] >= 7.0:
#             break

#     return best[1]


# def fetch_page(session: requests.Session, title: str) -> Dict[str, Any]:
#     data = wiki_get(
#         session,
#         {
#             "action": "query",
#             "titles": title,
#             "prop": "extracts|info|categories|pageprops",
#             "exintro": 1,
#             "explaintext": 1,
#             "cllimit": 200,
#             "inprop": "url",
#             "redirects": 1,
#             "format": "json",
#             "formatversion": 2,
#         },
#     )
#     pages = (data.get("query", {}).get("pages") or [])
#     if not pages or not isinstance(pages[0], dict) or pages[0].get("missing"):
#         return {"ok": False, "error": {"code": "missingtitle"}, "requested_title": title}

#     p = pages[0]
#     return {
#         "ok": True,
#         "title": p.get("title"),
#         "pageid": p.get("pageid"),
#         "fullurl": p.get("fullurl"),
#         "extract": p.get("extract") or "",
#         "categories": [c.get("title") for c in (p.get("categories") or []) if isinstance(c, dict)],
#         "pageprops": p.get("pageprops") or {},
#     }


# def run(names: List[str], out_path: str = "wikipedia_starwars_only.json", throttle_s: float = 0.25) -> List[Dict[str, Any]]:
#     session = requests.Session()
#     session.headers.update(
#         {
#             "User-Agent": "StarWarsOnlyWikiFetcher/1.0 (requests; contact: you@example.com)",
#             "Accept": "application/json",
#         }
#     )

#     results: List[Dict[str, Any]] = []
#     for i, name in enumerate(names, 1):
#         chosen = pick_best_starwars_title(session, name)
#         if not chosen:
#             item = {"ok": False, "requested_title": name, "error": {"code": "no_starwars_match"}}
#         else:
#             item = fetch_page(session, chosen)
#             item["requested_title"] = name
#             item["resolved_title"] = chosen

#             # Hard filter: require some Star Wars signal in categories/title/extract
#             hay = (item.get("title", "") + " " + item.get("extract", "") + " " + " ".join(item.get("categories", []))).lower()
#             if "star wars" not in hay and not any(k in hay for k in ("tatooine", "endor", "hoth", "dagobah", "death star")):
#                 item = {
#                     "ok": False,
#                     "requested_title": name,
#                     "resolved_title": chosen,
#                     "error": {"code": "resolved_but_not_starwars_enough"},
#                 }

#         results.append(item)
#         status = "OK" if item.get("ok") else "FAIL"
#         print(f"[{i:>3}/{len(names)}] {status}  {name} -> {item.get('resolved_title') or item.get('error',{}).get('code')}")
#         time.sleep(throttle_s)

#     with open(out_path, "w", encoding="utf-8") as f:
#         json.dump(results, f, ensure_ascii=False, indent=2)

#     print(f"\nSaved to {out_path}")
#     return results


# if __name__ == "__main__":
#     # quick demo; replace with your full list
#     demo = locations
#     run(demo, out_path="wikipedia_starwars_only_demo.json")
# # 

In [147]:
# Load all locations from location_characters.json (explicit UTF-8, consistent
# with the other JSON reads/writes in this notebook).
with open('data/location_characters.json', 'r', encoding='utf-8') as f:
    location_characters = json.load(f)
all_locations = list(location_characters.keys())
print(f"Total locations from location_characters.json: {len(all_locations)}")

# Load Wikipedia scraping results
with open('wikipedia_starwars_only_demo.json', 'r', encoding='utf-8') as f:
    wiki_data = json.load(f)

# Map each requested location name to its wiki extract. Entries that are not
# flagged ok, or whose extract is empty/missing, are stored as None so later
# cells can tell "scrape failed" apart from "never scraped" (key absent).
wiki_map = {
    entry.get('requested_title', ''): (
        entry.get('extract', '') if entry.get('ok') and entry.get('extract') else None
    )
    for entry in wiki_data
}

print(f"Locations with successful wiki data: {sum(1 for v in wiki_map.values() if v is not None)}")
print(f"Locations missing wiki data: {sum(1 for v in wiki_map.values() if v is None)}")

Total locations from location_characters.json: 94
Locations with successful wiki data: 83
Locations missing wiki data: 11


In [148]:
# Build one dspy.Example per location, covering every entry in all_locations.
# A location that is absent from wiki_map, or mapped to None (failed scrape),
# gets an empty string as its wiki_page.
examples = [
    dspy.Example(
        location_name=name,
        wiki_page=wiki_map.get(name) or '',
    ).with_inputs('location_name', 'wiki_page')
    for name in all_locations
]

# Locations whose wiki_page ended up empty, in the same order as all_locations.
missing_locations = [ex.location_name for ex in examples if not ex.wiki_page]

print(f"Created {len(examples)} dspy.Example instances (all {len(all_locations)} locations)")
print(f"Locations with wiki context: {len(examples) - len(missing_locations)}")
print(f"Locations with empty context: {len(missing_locations)}")
if missing_locations:
    print("\nMissing locations (will use empty context):")
    print("\n".join(f"  - {loc}" for loc in missing_locations))

# Quick look at the first example as a sanity check.
first_example = examples[0]
print("\nFirst example:")
print(f"  Location: {first_example.location_name}")
print(f"  Wiki page length: {len(first_example.wiki_page)} characters")
if not first_example.wiki_page:
    print("  Wiki page: (empty - will rely on LLM knowledge)")
else:
    print(f"  Wiki page preview: {first_example.wiki_page[:150]}...")


Created 94 dspy.Example instances (all 94 locations)
Locations with wiki context: 83
Locations with empty context: 11

Missing locations (will use empty context):
  - Barge Observation Deck
  - Battlefield
  - Bunker
  - Dungeon Corridor
  - Ewok Village Square
  - Ext/Int
  - Forest Clearing
  - Gantry
  - Ridge
  - Scout Campsite
  - Zev's Snowspeeder, Rogue Two

First example:
  Location: Another Cockpit
  Wiki page length: 870 characters
  Wiki page preview: Star Wars: Squadrons is a space combat game set in the Star Wars universe, developed by Motive Studio and published by Electronic Arts. It was release...


In [149]:
from dspy import Parallel

def call_generate_desc(**kwargs):
    """Keyword-only adapter around generate_desc for use with Parallel.

    Reads ``location_name`` and ``wiki_page`` from the supplied keyword
    arguments (each defaulting to '' when absent), ignores any other keys, and
    returns whatever generate_desc returns.
    """
    name = kwargs.get('location_name', '')
    page = kwargs.get('wiki_page', '')
    return generate_desc(location_name=name, wiki_page=page)


In [150]:
# One (callable, inputs) pair per example; the inputs are the fields marked via
# with_inputs() when the examples were built.
exec_pairs = [(call_generate_desc, ex.inputs()) for ex in examples]

# Executor shared by all pairs.
# NOTE(review): 100 worker threads for roughly 94 requests — consider lowering
# this if the provider starts rate-limiting.
parallel_executor = Parallel(
    num_threads=100,
    max_errors=None,
    access_examples=True,
    return_failed_examples=False,
    provide_traceback=True,
    disable_progress_bar=False,
)


In [151]:
# Run every (function, inputs) pair through the executor. The results are later
# zipped with `examples`, so they are expected to come back in input order.
parallel_results = parallel_executor(exec_pairs)

Processed 94 / 94 examples: 100%|██████████| 94/94 [00:22<00:00,  4.19it/s]


In [152]:
# zip() stops at the shorter input, so a length mismatch would silently drop
# locations from the saved file; check for it explicitly first.
if len(parallel_results) != len(examples):
    raise ValueError(
        f"Expected {len(examples)} results but got {len(parallel_results)}"
    )

# Pair each example with its generated description.
# NOTE(review): a failed call presumably shows up as None here and would be
# written as null — confirm against dspy.Parallel's behaviour.
output_data = [
    {
        'location_name': example.location_name,
        'description': result,  # description string returned by generate_desc
    }
    for example, result in zip(examples, parallel_results)
]

output_file = 'data/location_descriptions_llm_gemini_2_5_flash.json'
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the saved file.
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"Saved {len(output_data)} location descriptions to {output_file}")

Saved 94 location descriptions to data/location_descriptions_llm_gemini_2_5_flash.json


In [153]:
def price_token_cost(lm=lm):
    """Print the total tokens and total cost recorded in ``lm.history``.

    Args:
        lm: Object exposing a ``history`` sequence of dict-like records.
            Defaults to the notebook-level ``lm`` as bound when this function
            was defined.

    Records whose ``cost`` is missing or None are left out of the cost total;
    records without a ``usage['total_tokens']`` value are left out of the token
    total. Nothing is returned.
    """
    costs = []
    token_counts = []
    for record in lm.history:
        if record.get('cost') is not None:
            costs.append(record['cost'])
        usage = record.get('usage')
        if usage and usage.get('total_tokens') is not None:
            token_counts.append(usage['total_tokens'])

    print(f"Total tokens used: {sum(token_counts)}")
    print(f"Total cost: ${sum(costs):.6f}")

In [154]:
# Report the token and cost totals accumulated in the default lm's history so far.
price_token_cost()

Total tokens used: 134205
Total cost: $0.258046
