In [None]:
# imports and environment setup
import os  # standard library: read environment variables for keys in a secure, cross-platform way
from google import genai  # official Google Gen AI SDK entrypoint [1]
from google.genai import types  # typed request/response helpers, useful later for structured output [3]

# Read the API key from the environment for safety (never hardcode secrets in notebooks or repos).
# Export GOOGLE_API_KEY (preferred) or GEMINI_API_KEY in your shell BEFORE starting Jupyter.
# The key is deliberately NOT assigned to os.environ in this cell: writing a value here (even an empty
# placeholder) overwrites a key that was exported correctly and invites committing a pasted secret.
# The SDK auto-detects GEMINI_API_KEY or GOOGLE_API_KEY; GOOGLE_API_KEY takes precedence if both exist [4].
api_key = (os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY") or "").strip()  # drop stray copy-paste whitespace

# Fail fast with a clear message if no key is present, and point the user at the official place to create one.
if not api_key:
    raise RuntimeError(
        "Missing API key. Set GOOGLE_API_KEY (preferred) or GEMINI_API_KEY in your environment. "
        "Create a key in Google AI Studio, then restart this cell. See refs [1][4]."
    )

# Optional: choose which platform to use underneath.
# By default, the client talks to the Gemini Developer API using the key above.
# If you want to target Vertex AI later, you can set env vars before creating the client (commented out for now) [5][6]:
# os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"     # tell the SDK to route via Vertex AI
# os.environ["GOOGLE_CLOUD_PROJECT"] = "<your-gcp-project-id>"  # needed for Vertex AI routing
# os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"           # or "europe-west1", etc.

# Create a synchronous client. Passing api_key explicitly makes the setup unambiguous and notebook-friendly [1].
client = genai.Client(api_key=api_key)

# Optional connectivity check (left disabled): listing models confirms the key and network path work [7].
# models_iter = client.models.list()   returns a pager-like iterable of available base models

# Model used by every generate_content call in the cells below.
model_name = "gemini-2.5-flash"

In [19]:
# Connectivity ping: a fixed, trivial prompt keeps this check stable across notebook and CI runs.
test_prompt = "Return a single short sentence confirming the client is working."

# Sampling settings for the ping, built up front so the call itself stays short:
# a small token cap keeps the reply brief, a low temperature keeps it close to repeatable.
ping_config = types.GenerateContentConfig(max_output_tokens=64, temperature=0.2)

# Send the prompt to the model chosen during setup.
response = client.models.generate_content(
    model=model_name,
    contents=test_prompt,
    config=ping_config,
)

# response.text is the SDK's convenience accessor for the reply text.
print("Model replied:", response.text)

Model replied: The client is operational.


In [20]:
# ULID gives time-sortable unique IDs; great for append-only logs and human-friendly debugging.
# We use the lightweight ulid-py lib so IDs don't depend on Python's stdlib UUID versions across machines (add a version pin to the install line for reproducible installs).
# We add python-slugify to build stable, human-readable candidate IDs from tool names.
# !uv pip install ulid-py python-slugify

In [26]:
import os  # filesystem ops and env access
import json  # serialize structured outputs to disk as JSON and JSONL
from datetime import datetime, timezone  # ISO 8601 timestamps in UTC for consistent logs
import re  # light sanitization of domains if the model returns noisy text
import ulid  # time-sortable unique IDs (monotonic-ish per millisecond)
from slugify import slugify  # normalize tool names into URL-safe slugs
import hashlib  # short deterministic hash to stabilize IDs across machines
from typing import TypedDict, NotRequired, Literal  # simple runtime-free schema typing

In [27]:
# Define a stable, extensible record envelope. This future-proofs the append-only store.
class IntentRecord(TypedDict):
    """Envelope for one captured intent, as written to disk by the persist step.

    Every key except ``meta`` is required. ``kind`` and ``source`` are fixed literal tags,
    and ``version`` identifies the envelope shape so older records can still be told apart later.
    """
    id: str                                        # ULID string; unique and time-sortable
    kind: Literal["capture-intent"]                # record type tag for easy querying later
    version: int                                   # schema version for evolvability; bump on breaking shape changes
    created_at: str                                # RFC3339/ISO8601 UTC timestamp
    source: Literal["user-input+llm"]              # provenance tag to aid observability
    raw_input: str                                 # original user text, verbatim for traceability
    data: dict                                     # arbitrarily nested payload; the notebook stores the normalized resolution here
    meta: NotRequired[dict]                        # optional bag for deploy, notebook, or operator notes

# Define the structured payload for the current step.
class ToolCandidate(TypedDict):
    # One tool the model proposes for the user's text.
    # Only tool_name is required; every other key is NotRequired and may be absent from the model's JSON,
    # so readers of this structure should use .get() rather than indexing.
    tool_name: str                                 # canonical tool name, e.g., "Salesforce"
    developer: NotRequired[str]                    # org or vendor, e.g., "Salesforce, Inc."
    website_domain: NotRequired[str]               # bare domain, e.g., "salesforce.com"
    website_url: NotRequired[str]                  # full URL, e.g., "https://www.salesforce.com"
    logo_url: NotRequired[str]                     # logo URL; the prompt asks the model to leave it empty, post-processing derives it from the domain
    confidence: NotRequired[float]                 # model-reported confidence between 0 and 1
    notes: NotRequired[str]                        # brief disambiguation notes for UX

class ToolResolution(TypedDict):
    # Top-level shape requested from the model; this class is passed as response_schema in the LLM call.
    # The model can propose multiple candidates; we keep them all for auditability.
    candidates: list[ToolCandidate]                # ordered best-first by the model
    selected_index: int                            # 0-based index into candidates that we will persist as "current pick"
    disambiguation: NotRequired[str]               # short text to show the user when there’s ambiguity
    citations: NotRequired[list[str]]              # optional URLs the model relied on (best-effort)

In [23]:
# Collect the free-text request for this POC run.
# A real UI would supply this from its capture screen; the notebook prompts interactively instead.
raw_reply = input("Describe the data tool you want (e.g., 'I need Salesforce data.'): ")  # blocks until Enter is pressed
user_text = raw_reply.strip()  # surrounding whitespace carries no meaning

# Stop right here on blank input so no empty record gets written by later cells.
if user_text == "":
    raise ValueError("Please provide a short description of the tool you want.")

In [28]:
# LLM call to infer tool identity and normalize fields
# We ask Gemini to return JSON that matches our schema, so later steps can be purely mechanical.
# We do not fetch logos from the web here; logo_url is derived from the domain in the post-processing cell
# using the Clearbit-style pattern https://logo.clearbit.com/<domain>.
# NOTE(review): USER_TEXT is interpolated into the prompt verbatim; treat the model's reply as untrusted data.

# Build a compact, instruction-only prompt to minimize model verbosity and reduce parse risk.
prompt = f"""
You are a resolver that identifies a software/data tool from noisy user text.

USER_TEXT: {user_text}

Produce JSON with this structure:
{{
  "candidates": [
    {{
      "tool_name": "<canonical tool name>",
      "developer": "<vendor or developer, if known>",
      "website_domain": "<registrable domain like salesforce.com, if known>",
      "website_url": "<full homepage URL, if known>",
      "logo_url": "",  // leave empty; caller may set Clearbit-style logo from domain
      "confidence": 0.0,  // 0 to 1
      "notes": "<one-line disambiguation or clarifying note>"
    }}
  ],
  "selected_index": 0,
  "disambiguation": "<one short sentence if multiple tools are plausible>",
  "citations": []  // optional URLs if you used known references
}}

Rules:
- If multiple tools match, include the top 2-3 candidates in descending confidence and set selected_index accordingly.
- Prefer official vendor domains, not community links.
- If unsure, still return best-effort candidates and explain uncertainty in 'disambiguation'.
- Return only JSON. No extra text.
"""

# Ask the model for schema-shaped JSON using the SDK's JSON mode.
resp = client.models.generate_content(  # high-level text generation entry point from the google-genai SDK
    model=model_name,                   # previously selected model, e.g., "gemini-2.5-flash"
    contents=prompt,                    # instruction block above
    config=types.GenerateContentConfig( # typed config makes request explicit and helps future readers
        response_mime_type="application/json",  # request JSON so we can parse programmatically
        response_schema=ToolResolution,         # schema handed to the SDK to shape and parse the JSON reply
        temperature=0,                          # lowest-variance sampling reduces flakiness in stateful flows
        max_output_tokens=512,                  # cap on reply size; a truncated reply is handled below
    ),
)

# Prefer the SDK-parsed object. resp.parsed can be None (e.g. when the reply text is empty or is not
# valid JSON), so fall back to parsing resp.text ourselves before giving up.
tool_resolution = resp.parsed  # type: ignore[assignment]
if tool_resolution is None:
    try:
        tool_resolution = json.loads(resp.text or "")
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        tool_resolution = None

# Downstream cells call .get() on this object, so always hand them a dict. An empty candidate list
# makes the post-processing cell take its explicit "Unknown" fallback instead of crashing.
if not isinstance(tool_resolution, dict):
    print("Warning: model returned no parseable JSON; continuing with an empty candidate list.")
    tool_resolution = {"candidates": [], "selected_index": 0}

# Display raw JSON for inspection during POC runs.
print(json.dumps(tool_resolution, indent=2))

{
  "candidates": [
    {
      "tool_name": "Facebook",
      "developer": "Meta Platforms, Inc.",
      "website_domain": "facebook.com",
      "website_url": "https://www.facebook.com",
      "logo_url": "",
      "confidence": 1.0,
      "notes": "Social media platform"
    }
  ],
  "selected_index": 0
}


In [29]:
# post-process: sanitize domains, populate logo URL, pick selected candidate
# sanitize_domain is a small named helper (a def rather than an assigned lambda, so it can carry a docstring).
def sanitize_domain(d):
    """Reduce a URL or domain-like string to a bare, lowercase host.

    Args:
        d: A URL ("https://www.example.com/path"), a bare domain ("Example.com"), or None/empty.

    Returns:
        The text with surrounding whitespace removed, a leading http:// or https:// dropped
        (matched case-insensitively), everything from the first "/", "?" or "#" cut off, and
        the remainder lowercased. Returns "" for None or empty input. A "www." prefix and any
        port are kept as-is.
    """
    if not d:
        return ""
    without_scheme = re.sub(r"^https?://", "", str(d).strip(), flags=re.IGNORECASE)
    return re.split(r"[/?#]", without_scheme, maxsplit=1)[0].lower()

# Normalize each candidate
# A missing or non-dict tool_resolution (e.g. the reply could not be parsed) is treated as "no candidates"
# so this cell reaches its explicit fallback below instead of failing on .get().
resolution = tool_resolution if isinstance(tool_resolution, dict) else {}

normalized_candidates = []  # we'll fill this with cleaned copies
for cand in resolution.get("candidates") or []:  # iterate proposed candidates in given order
    if not isinstance(cand, dict):  # ignore malformed entries rather than crash on them
        continue

    # `or ""` (instead of a .get default) also covers keys that are present but null.
    tool_name = (cand.get("tool_name") or "").strip()    # canonical name
    developer = (cand.get("developer") or "").strip()    # vendor optional
    notes = (cand.get("notes") or "").strip()            # one-liner aid
    given_url = (cand.get("website_url") or "").strip()  # URL exactly as the model supplied it

    # Prefer the explicit domain; fall back to the URL's host when the domain is missing or sanitizes to nothing.
    domain = sanitize_domain((cand.get("website_domain") or "").strip()) or sanitize_domain(given_url)
    website_url = given_url or (f"https://{domain}" if domain else "")  # synthesize URL if only a domain exists
    # Clearbit-style logo heuristic for the POC.
    # NOTE(review): confirm the logo.clearbit.com endpoint is still served before relying on it.
    logo_url = cand.get("logo_url") or (f"https://logo.clearbit.com/{domain}" if domain else "")

    # Coerce confidence to a float in [0, 1]; null, non-numeric, or NaN values become 0.0.
    try:
        confidence = float(cand.get("confidence") or 0.0)
    except (TypeError, ValueError):
        confidence = 0.0
    if confidence != confidence:  # NaN is the only value not equal to itself
        confidence = 0.0
    confidence = min(1.0, max(0.0, confidence))

    # Deterministic candidateId: readable slug plus a short hash of name+domain.
    name_slug = slugify(tool_name) or "unknown"  # readable component
    hash_input = f"{tool_name}|{domain}".encode("utf-8")  # bind identity to canonical name+domain
    short_hash = hashlib.sha256(hash_input).hexdigest()[:12]  # first 12 hex chars keep the ID compact
    candidate_id = f"{name_slug}-{short_hash}"  # final candidateId

    normalized_candidates.append({
        "candidateId": candidate_id,   # stable identifier derived above
        "tool_name": tool_name,        # whitespace-trimmed
        "developer": developer,        # "" when unknown
        "website_domain": domain,      # normalized bare domain
        "website_url": website_url,    # given or synthesized
        "logo_url": logo_url,          # given or derived from the domain
        "confidence": confidence,      # float within 0..1
        "notes": notes,                # short helper text
    })

# Fall back to a single unknown candidate if the model returned nothing, for resilience.
if not normalized_candidates:  # guardrail for empty model output
    normalized_candidates = [{
        "candidateId": "unknown-000000000000",  # placeholder to keep shape stable
        "tool_name": "Unknown",
        "developer": "",
        "website_domain": "",
        "website_url": "",
        "logo_url": "",
        "confidence": 0.0,
        "notes": "No candidates returned by model",
    }]

# Validate selected_index: anything null, non-numeric, or out of range resets to the first candidate.
try:
    selected_index = int(resolution.get("selected_index") or 0)
except (TypeError, ValueError):
    selected_index = 0
if not 0 <= selected_index < len(normalized_candidates):
    selected_index = 0

# Compose the final structured payload that we will store.
capture_payload = {
    "candidates": normalized_candidates,                         # normalized list
    "selected_index": selected_index,                            # validated index
    "disambiguation": resolution.get("disambiguation") or "",    # carry through for the UI; null becomes ""
    "citations": resolution.get("citations") or [],              # optional links for traceability; null becomes []
}

# Pretty-print what we'll show to the user next (name, developer, domain, logo).
selected = normalized_candidates[selected_index]  # the current pick
print("Selected tool for confirmation:")
print(f"- Name: {selected['tool_name']}")
print(f"- Developer: {selected['developer'] or 'Unknown'}")
print(f"- Domain: {selected['website_domain'] or 'Unknown'}")
print(f"- Logo: {selected['logo_url'] or 'Not available'}")


Selected tool for confirmation:
- Name: Facebook
- Developer: Meta Platforms, Inc.
- Domain: facebook.com
- Logo: https://logo.clearbit.com/facebook.com


In [None]:
# Persist the capture locally: one pretty-printed JSON file per record for random access,
# plus one compact line appended to a JSONL ledger that serves as the audit trail.

# Storage lives in a folder relative to the notebook's working directory.
base_dir = "capture-intent"
os.makedirs(base_dir, exist_ok=True)  # no error when the folder is already there

# Fresh ULID string for this capture, e.g. "01J6YP4ZQF2F7PAZ7QJX4P1C6H".
record_id = str(ulid.new())

# Run metadata, assembled separately to keep the envelope literal below compact.
run_meta = {
    "model": model_name,                  # model that produced this capture
    "sdk": "google-genai",                # client stack in use
    "notebook": "capture_intent_poc_v1",  # workflow version tag
}

# The envelope itself; "version" should be bumped on breaking shape changes.
record: IntentRecord = {
    "id": record_id,
    "kind": "capture-intent",
    "version": 1,
    "created_at": datetime.now(timezone.utc).isoformat(),  # timezone-aware UTC timestamp
    "source": "user-input+llm",
    "raw_input": user_text,    # the user's exact words
    "data": capture_payload,   # normalized payload built by the post-processing cell
    "meta": run_meta,
}

# Both destinations are derived from the same base folder.
per_record_path = os.path.join(base_dir, f"{record_id}.json")
index_path = os.path.join(base_dir, "index.jsonl")

# Per-record file: indented JSON meant for human inspection.
with open(per_record_path, "w", encoding="utf-8") as record_file:
    record_file.write(json.dumps(record, ensure_ascii=False, indent=2))

# Ledger: append mode, exactly one compact JSON document per line.
with open(index_path, "a", encoding="utf-8") as index_file:
    index_file.write(json.dumps(record, ensure_ascii=False) + "\n")

# Report where the record went.
for report_line in (
    f"Wrote record {record_id} to:",
    f" - {per_record_path}",
    f" - {index_path} (appended)",
):
    print(report_line)


Wrote record 01K2ZFFBMHESTG7Q8CZQ54B966 to:
 - capture-intent/01K2ZFFBMHESTG7Q8CZQ54B966.json
 - capture-intent/index.jsonl (appended)


In [17]:
# Minimal echo of the current pick, shaped so a front end can lift the fields straight into a confirmation pane.
display_payload = dict(
    id=record["id"],                                          # handle for follow-up actions
    tool_name=selected["tool_name"],                          # primary label
    developer=selected["developer"],                          # secondary label
    domain=selected["website_domain"],                        # for linking and further discovery
    logo=selected["logo_url"],                                # image src candidate
    disambiguation=record["data"].get("disambiguation", ""),  # optional helper text
)

# Emit as indented JSON so it is easy to copy into the UI layer.
print(json.dumps(display_payload, indent=2))


{
  "id": "01K2ZFFBMHESTG7Q8CZQ54B966",
  "tool_name": "WhatsApp",
  "developer": "Meta Platforms",
  "domain": "whatsapp.com",
  "logo": "https://logo.clearbit.com/whatsapp.com",
  "disambiguation": ""
}


In [None]:
# imports and environment setup
import os  # standard library: read environment variables for keys in a secure, cross-platform way
from google import genai  # official Google Gen AI SDK entrypoint [1]
from google.genai import types  # typed request/response helpers, useful later for structured output [3]

# Read the API key from the environment for safety (never hardcode secrets in notebooks or repos).
# Export GOOGLE_API_KEY (preferred) or GEMINI_API_KEY in your shell BEFORE starting Jupyter.
# The key is deliberately NOT assigned to os.environ in this cell: writing a value here (even an empty
# placeholder) overwrites a key that was exported correctly and invites committing a pasted secret.
# The SDK auto-detects GEMINI_API_KEY or GOOGLE_API_KEY; GOOGLE_API_KEY takes precedence if both exist [4].
api_key = (os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY") or "").strip()  # drop stray copy-paste whitespace

# Fail fast with a clear message if no key is present, and point the user at the official place to create one.
if not api_key:
    raise RuntimeError(
        "Missing API key. Set GOOGLE_API_KEY (preferred) or GEMINI_API_KEY in your environment. "
        "Create a key in Google AI Studio, then restart this cell. See refs [1][4]."
    )

# Optional: choose which platform to use underneath.
# By default, the client talks to the Gemini Developer API using the key above.
# If you want to target Vertex AI later, you can set env vars before creating the client (commented out for now) [5][6]:
# os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"     # tell the SDK to route via Vertex AI
# os.environ["GOOGLE_CLOUD_PROJECT"] = "<your-gcp-project-id>"  # needed for Vertex AI routing
# os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"           # or "europe-west1", etc.

# Create a synchronous client. Passing api_key explicitly makes the setup unambiguous and notebook-friendly [1].
client = genai.Client(api_key=api_key)

# Optional connectivity check (left disabled): listing models confirms the key and network path work [7].
# models_iter = client.models.list()   returns a pager-like iterable of available base models

# Model used by every generate_content call in the cells below.
model_name = "gemini-2.5-flash"
print("Setup complete - Gemini client initialized")

Setup complete - Gemini client initialized


In [31]:
# Connectivity ping: a fixed, trivial prompt keeps this check stable across notebook and CI runs.
test_prompt = "Return a single short sentence confirming the client is working."

# Sampling settings for the ping, built up front so the call itself stays short:
# a small token cap keeps the reply brief, a low temperature keeps it close to repeatable.
ping_config = types.GenerateContentConfig(max_output_tokens=64, temperature=0.2)

# Send the prompt to the model chosen during setup.
response = client.models.generate_content(
    model=model_name,
    contents=test_prompt,
    config=ping_config,
)

# response.text is the SDK's convenience accessor for the reply text.
print("Model replied:", response.text)

Model replied: Client is operational.


In [32]:
import os  # filesystem ops and env access
import json  # serialize structured outputs to disk as JSON and JSONL
import hashlib  # for generating stable candidate IDs
from datetime import datetime, timezone  # ISO 8601 timestamps in UTC for consistent logs
import re  # light sanitization of domains if the model returns noisy text
import ulid  # time-sortable unique IDs (monotonic-ish per millisecond)
from slugify import slugify  # normalize tool names into URL-safe slugs
from typing import TypedDict, NotRequired, Literal  # simple runtime-free schema typing

# Visible confirmation that every import in this cell succeeded (a failed import raises before this line).
print("Dependencies imported successfully")

Dependencies imported successfully


In [33]:
# Define a stable, extensible record envelope. This future-proofs the append-only store.
class IntentRecord(TypedDict):
    """Envelope for one captured intent, as written to disk by the persist step.

    Every key except ``meta`` is required. ``kind`` and ``source`` are fixed literal tags,
    and ``version`` identifies the envelope shape so older records can still be told apart later.
    """
    id: str                                        # ULID string; unique and time-sortable
    kind: Literal["capture-intent"]                # record type tag for easy querying later
    version: int                                   # schema version for evolvability; bump on breaking shape changes
    created_at: str                                # RFC3339/ISO8601 UTC timestamp
    source: Literal["user-input+llm"]              # provenance tag to aid observability
    raw_input: str                                 # original user text, verbatim for traceability
    data: dict                                     # arbitrarily nested payload; the notebook stores the normalized resolution here
    meta: NotRequired[dict]                        # optional bag for deploy, notebook, or operator notes

# Define the structured payload for the current step.
class ToolCandidate(TypedDict):
    # One tool the model proposes for the user's text.
    # Only tool_name is required; every other key is NotRequired and may be absent from the model's JSON,
    # so readers of this structure should use .get() rather than indexing.
    tool_name: str                                 # canonical tool name, e.g., "Salesforce"
    developer: NotRequired[str]                    # org or vendor, e.g., "Salesforce, Inc."
    website_domain: NotRequired[str]               # bare domain, e.g., "salesforce.com"
    website_url: NotRequired[str]                  # full URL, e.g., "https://www.salesforce.com"
    logo_url: NotRequired[str]                     # logo URL; the prompt asks the model to leave it empty, post-processing derives it from the domain
    confidence: NotRequired[float]                 # model-reported confidence between 0 and 1
    notes: NotRequired[str]                        # brief disambiguation notes for UX

class ToolResolution(TypedDict):
    # Top-level shape requested from the model; this class is passed as response_schema in the LLM call.
    # The model can propose multiple candidates; we keep them all for auditability.
    candidates: list[ToolCandidate]                # ordered best-first by the model
    selected_index: int                            # 0-based index into candidates that we will persist as "current pick"
    disambiguation: NotRequired[str]               # short text to show the user when there's ambiguity
    citations: NotRequired[list[str]]              # optional URLs the model relied on (best-effort)

# Visible confirmation that the three TypedDict classes in this cell were defined without error.
print("TypedDict schemas defined")

TypedDict schemas defined


In [None]:
# Collect the free-text request for this POC run.
# A real UI would supply this from its capture screen; the notebook prompts interactively instead.
raw_reply = input("Describe the data tool you want (e.g., 'I need Salesforce customer data'): ")  # blocks until Enter is pressed
user_text = raw_reply.strip()  # surrounding whitespace carries no meaning

# Stop right here on blank input so no empty record gets written by later cells.
if user_text == "":
    raise ValueError("Please provide a short description of the tool you want.")

# Echo the captured text so the run log shows what was resolved.
print(f"User input captured: '{user_text}'")

User input captured: 'hibob'


In [35]:
# LLM call to infer tool identity and normalize fields
# We ask Gemini to return JSON that matches our schema, so later steps can be purely mechanical.
# We do not fetch logos from the web here; logo_url is derived from the domain in the post-processing cell
# using the Clearbit-style pattern https://logo.clearbit.com/<domain>.
# NOTE(review): USER_TEXT is interpolated into the prompt verbatim; treat the model's reply as untrusted data.

# Build a compact, instruction-only prompt to minimize model verbosity and reduce parse risk.
prompt = f"""
You are a resolver that identifies a software/data tool from noisy user text.

USER_TEXT: {user_text}

Produce JSON with this structure:
{{
  "candidates": [
    {{
      "tool_name": "<canonical tool name>",
      "developer": "<vendor or developer, if known>",
      "website_domain": "<registrable domain like salesforce.com, if known>",
      "website_url": "<full homepage URL, if known>",
      "logo_url": "",  // leave empty; caller may set Clearbit-style logo from domain
      "confidence": 0.0,  // 0 to 1
      "notes": "<one-line disambiguation or clarifying note>"
    }}
  ],
  "selected_index": 0,
  "disambiguation": "<one short sentence if multiple tools are plausible>",
  "citations": []  // optional URLs if you used known references
}}

Rules:
- If multiple tools match, include the top 2-3 candidates in descending confidence and set selected_index accordingly.
- Prefer official vendor domains, not community links.
- If unsure, still return best-effort candidates and explain uncertainty in 'disambiguation'.
- Return only JSON. No extra text.
"""

# Ask the model for schema-shaped JSON using the SDK's JSON mode.
resp = client.models.generate_content(  # high-level text generation entry point from the google-genai SDK
    model=model_name,                   # previously selected model, e.g., "gemini-2.5-flash"
    contents=prompt,                    # instruction block above
    config=types.GenerateContentConfig( # typed config makes request explicit and helps future readers
        response_mime_type="application/json",  # request JSON so we can parse programmatically
        response_schema=ToolResolution,         # schema handed to the SDK to shape and parse the JSON reply
        temperature=0,                          # lowest-variance sampling reduces flakiness in stateful flows
        max_output_tokens=512,                  # cap on reply size; a truncated reply is handled below
    ),
)

# Prefer the SDK-parsed object. resp.parsed can be None (e.g. when the reply text is empty or is not
# valid JSON), so fall back to parsing resp.text ourselves before giving up.
tool_resolution = resp.parsed  # type: ignore[assignment]
if tool_resolution is None:
    try:
        tool_resolution = json.loads(resp.text or "")
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        tool_resolution = None

# Downstream cells call .get() on this object, so always hand them a dict. An empty candidate list
# makes the post-processing cell take its explicit "Unknown" fallback instead of crashing.
if not isinstance(tool_resolution, dict):
    print("Warning: model returned no parseable JSON; continuing with an empty candidate list.")
    tool_resolution = {"candidates": [], "selected_index": 0}

# Display raw JSON for inspection during POC runs.
print("LLM Resolution Result:")
print(json.dumps(tool_resolution, indent=2))

LLM Resolution Result:
{
  "candidates": [
    {
      "tool_name": "HiBob",
      "developer": "HiBob",
      "website_domain": "hibob.com",
      "website_url": "https://www.hibob.com/",
      "logo_url": "",
      "confidence": 1.0,
      "notes": "HR and people management platform"
    }
  ],
  "selected_index": 0
}


In [36]:
# post-process: sanitize domains, populate logo URL, pick selected candidate
# sanitize_domain is a small named helper (a def rather than an assigned lambda, so it can carry a docstring).
def sanitize_domain(d):
    """Reduce a URL or domain-like string to a bare, lowercase host.

    Args:
        d: A URL ("https://www.example.com/path"), a bare domain ("Example.com"), or None/empty.

    Returns:
        The text with surrounding whitespace removed, a leading http:// or https:// dropped
        (matched case-insensitively), everything from the first "/", "?" or "#" cut off, and
        the remainder lowercased. Returns "" for None or empty input. A "www." prefix and any
        port are kept as-is.
    """
    if not d:
        return ""
    without_scheme = re.sub(r"^https?://", "", str(d).strip(), flags=re.IGNORECASE)
    return re.split(r"[/?#]", without_scheme, maxsplit=1)[0].lower()

# Normalize each candidate
# A missing or non-dict tool_resolution (e.g. the reply could not be parsed) is treated as "no candidates"
# so this cell reaches its explicit fallback below instead of failing on .get().
resolution = tool_resolution if isinstance(tool_resolution, dict) else {}

normalized_candidates = []  # we'll fill this with cleaned copies
for cand in resolution.get("candidates") or []:  # iterate proposed candidates in given order
    if not isinstance(cand, dict):  # ignore malformed entries rather than crash on them
        continue

    # `or ""` (instead of a .get default) also covers keys that are present but null.
    tool_name = (cand.get("tool_name") or "").strip()    # canonical name
    developer = (cand.get("developer") or "").strip()    # vendor optional
    notes = (cand.get("notes") or "").strip()            # one-liner aid
    given_url = (cand.get("website_url") or "").strip()  # URL exactly as the model supplied it

    # Prefer the explicit domain; fall back to the URL's host when the domain is missing or sanitizes to nothing.
    domain = sanitize_domain((cand.get("website_domain") or "").strip()) or sanitize_domain(given_url)
    website_url = given_url or (f"https://{domain}" if domain else "")  # synthesize URL if only a domain exists
    # Clearbit-style logo heuristic for the POC.
    # NOTE(review): confirm the logo.clearbit.com endpoint is still served before relying on it.
    logo_url = cand.get("logo_url") or (f"https://logo.clearbit.com/{domain}" if domain else "")

    # Coerce confidence to a float in [0, 1]; null, non-numeric, or NaN values become 0.0.
    try:
        confidence = float(cand.get("confidence") or 0.0)
    except (TypeError, ValueError):
        confidence = 0.0
    if confidence != confidence:  # NaN is the only value not equal to itself
        confidence = 0.0
    confidence = min(1.0, max(0.0, confidence))

    # Deterministic candidateId: readable slug plus a short hash of name+domain.
    name_slug = slugify(tool_name) or "unknown"  # readable component
    hash_input = f"{tool_name}|{domain}".encode("utf-8")  # bind identity to canonical name+domain
    short_hash = hashlib.sha256(hash_input).hexdigest()[:12]  # first 12 hex chars keep the ID compact
    candidate_id = f"{name_slug}-{short_hash}"  # final candidateId

    normalized_candidates.append({
        "candidateId": candidate_id,   # stable identifier derived above
        "tool_name": tool_name,        # whitespace-trimmed
        "developer": developer,        # "" when unknown
        "website_domain": domain,      # normalized bare domain
        "website_url": website_url,    # given or synthesized
        "logo_url": logo_url,          # given or derived from the domain
        "confidence": confidence,      # float within 0..1
        "notes": notes,                # short helper text
    })

# Fall back to a single unknown candidate if the model returned nothing, for resilience.
if not normalized_candidates:  # guardrail for empty model output
    normalized_candidates = [{
        "candidateId": "unknown-000000000000",  # placeholder to keep shape stable
        "tool_name": "Unknown",
        "developer": "",
        "website_domain": "",
        "website_url": "",
        "logo_url": "",
        "confidence": 0.0,
        "notes": "No candidates returned by model",
    }]

# Validate selected_index: anything null, non-numeric, or out of range resets to the first candidate.
try:
    selected_index = int(resolution.get("selected_index") or 0)
except (TypeError, ValueError):
    selected_index = 0
if not 0 <= selected_index < len(normalized_candidates):
    selected_index = 0

# Compose the final structured payload that we will store.
capture_payload = {
    "candidates": normalized_candidates,                         # normalized list
    "selected_index": selected_index,                            # validated index
    "disambiguation": resolution.get("disambiguation") or "",    # carry through for the UI; null becomes ""
    "citations": resolution.get("citations") or [],              # optional links for traceability; null becomes []
}

# Pretty-print what we'll show to the user next.
selected = normalized_candidates[selected_index]  # the current pick
print("Selected tool for confirmation:")
print(f"- Name: {selected['tool_name']}")
print(f"- Developer: {selected['developer'] or 'Unknown'}")
print(f"- Domain: {selected['website_domain'] or 'Unknown'}")
print(f"- Logo: {selected['logo_url'] or 'Not available'}")
print(f"- Candidate ID: {selected['candidateId']}")
print(f"- Confidence: {selected['confidence']}")
print(f"- Notes: {selected['notes']}")

Selected tool for confirmation:
- Name: HiBob
- Developer: HiBob
- Domain: hibob.com
- Logo: https://logo.clearbit.com/hibob.com
- Candidate ID: hibob-8f11b0239b49
- Confidence: 1.0
- Notes: HR and people management platform


In [37]:
# Persist the capture locally: one pretty-printed JSON file per record for random access,
# plus one compact line appended to a JSONL ledger that serves as the audit trail.

# Storage lives in a folder relative to the notebook's working directory.
base_dir = "capture-intent"
os.makedirs(base_dir, exist_ok=True)  # no error when the folder is already there

# Fresh ULID string for this capture, e.g. "01J6YP4ZQF2F7PAZ7QJX4P1C6H".
record_id = str(ulid.new())

# Run metadata, assembled separately to keep the envelope literal below compact.
run_meta = {
    "model": model_name,                  # model that produced this capture
    "sdk": "google-genai",                # client stack in use
    "notebook": "capture_intent_poc_v2",  # workflow version tag
}

# The envelope itself; "version" should be bumped on breaking shape changes.
record: IntentRecord = {
    "id": record_id,
    "kind": "capture-intent",
    "version": 1,
    "created_at": datetime.now(timezone.utc).isoformat(),  # timezone-aware UTC timestamp
    "source": "user-input+llm",
    "raw_input": user_text,    # the user's exact words
    "data": capture_payload,   # normalized payload built by the post-processing cell
    "meta": run_meta,
}

# Both destinations are derived from the same base folder.
per_record_path = os.path.join(base_dir, f"{record_id}.json")
index_path = os.path.join(base_dir, "index.jsonl")

# Per-record file: indented JSON meant for human inspection.
with open(per_record_path, "w", encoding="utf-8") as record_file:
    record_file.write(json.dumps(record, ensure_ascii=False, indent=2))

# Ledger: append mode, exactly one compact JSON document per line.
with open(index_path, "a", encoding="utf-8") as index_file:
    index_file.write(json.dumps(record, ensure_ascii=False) + "\n")

# Report where the record went.
for report_line in (
    f"Wrote record {record_id} to:",
    f" - {per_record_path}",
    f" - {index_path} (appended)",
):
    print(report_line)

Wrote record 01K317V3K6J1TZEBK32SZZKCZK to:
 - capture-intent/01K317V3K6J1TZEBK32SZZKCZK.json
 - capture-intent/index.jsonl (appended)


In [38]:
# Minimal echo of the current pick, shaped so a front end can lift the fields straight into a confirmation pane.
display_payload = dict(
    id=record["id"],                                          # handle for follow-up actions
    tool_name=selected["tool_name"],                          # primary label
    developer=selected["developer"],                          # secondary label
    domain=selected["website_domain"],                        # for linking and further discovery
    logo=selected["logo_url"],                                # image src candidate
    disambiguation=record["data"].get("disambiguation", ""),  # optional helper text
)

# Emit a heading followed by indented JSON so it is easy to copy into the UI layer.
print("Final display payload for UI:")
print(json.dumps(display_payload, indent=2))

Final display payload for UI:
{
  "id": "01K317V3K6J1TZEBK32SZZKCZK",
  "tool_name": "HiBob",
  "developer": "HiBob",
  "domain": "hibob.com",
  "logo": "https://logo.clearbit.com/hibob.com",
  "disambiguation": ""
}
