# LangGraph translation demo - Multi-Step Pipeline

This notebook demonstrates the enhanced **multi-step "generate-and-filter"** translation pipeline.

The new pipeline has 6 stages:
1. **analyse_sense** - Understand semantic nuances
2. **translate_definition** - Translate definition with context
3. **translate_all_lemmas** - Direct translation of each lemma
4. **expand_synonyms** - Broaden candidate pool in target language
5. **filter_synonyms** - Quality check to remove imperfect matches
6. **assemble_result** - Combine all outputs into final synset

This approach generates high-quality synsets (sets of synonymous literals) rather than a single "headword".

In [1]:
from pathlib import Path
import json
from pprint import pprint

# Path to the enhanced synset-pair export, relative to the notebook's working directory.
DATA_PATH = Path("../examples/serbian_english_synset_pairs_enhanced.json")
dataset = json.loads(DATA_PATH.read_text(encoding="utf-8"))

# "pairs" must be present; "metadata" is optional and falls back to an empty dict.
metadata = dataset.get("metadata", {})
pairs = dataset["pairs"]

print(f"Loaded {len(pairs)} pairs from {DATA_PATH}", "Metadata snapshot:", sep="\n")
pprint(metadata)

# The first pair is the running example for the cells that follow.
sample_pair = pairs[0]
print("\nFirst pair keys:", list(sample_pair))

Loaded 27 pairs from ..\examples\serbian_english_synset_pairs_enhanced.json
Metadata snapshot:
{'created_by': 'Serbian WordNet Synset Browser',
 'description': 'Enhanced export with Serbian and English relations for '
                'translation context',
 'export_timestamp': '2025-07-29T13:57:13.380980',
 'format_version': '2.0',
 'includes_metadata': True,
 'includes_relations': True,
 'total_pairs': 27}

First pair keys: ['serbian_id', 'serbian_synonyms', 'serbian_definition', 'serbian_usage', 'serbian_pos', 'serbian_domain', 'serbian_relations', 'english_id', 'english_definition', 'english_lemmas', 'english_examples', 'english_pos', 'english_name', 'english_relations', 'pairing_metadata']


In [2]:
# Pretty-print a hand-picked subset of the sample pair's fields
# (a missing field is shown as None).
pprint(
    {
        field: sample_pair.get(field)
        for field in (
            "english_id",
            "english_lemmas",
            "english_definition",
            "english_examples",
            "serbian_synonyms",
            "serbian_definition",
            "serbian_usage",
        )
    }
)

{'english_definition': 'an establishment consisting of a building or complex '
                       'of buildings where an organization for the promotion '
                       'of some cause is situated',
 'english_examples': [],
 'english_id': 'ENG30-03574555-n',
 'english_lemmas': ['institution'],
 'serbian_definition': 'zgrada u kojoj se nalazi organizaciona jedinica neke '
                       'grane javnog poslovanja',
 'serbian_synonyms': ['ustanova'],
 'serbian_usage': ''}


In [3]:
import importlib
import ollama
import wordnet_autotranslate.pipelines.langgraph_translation_pipeline as lg_module

# Reload the pipeline module and rebind the class from the reloaded module object.
lg_module = importlib.reload(lg_module)
LangGraphTranslationPipeline = lg_module.LangGraphTranslationPipeline

# Ollama settings a reader may want to tune.
PREFERRED_OLLAMA_MODEL = "gpt-oss:120b"
OLLAMA_TIMEOUT = 180  # seconds
OLLAMA_TEMPERATURE = 0.0

# Ask the local Ollama runtime which models are installed.
try:
    model_list_response = ollama.list()
    available_models = set(entry.model for entry in model_list_response.models)
except Exception as err:  # pragma: no cover - depends on local runtime
    raise RuntimeError(
        "Could not reach the local Ollama daemon. Start it with `ollama serve`."
    ) from err

if len(available_models) == 0:
    raise RuntimeError(
        "No Ollama models are installed. Pull one with `ollama pull <model>` before running this cell."
    )

# Use the preferred model when installed; otherwise fall back to the
# alphabetically first installed model and say so.
ollama_model = PREFERRED_OLLAMA_MODEL
if ollama_model not in available_models:
    ollama_model = min(available_models)
    print(
        f"Preferred model '{PREFERRED_OLLAMA_MODEL}' not found. "
        f"Falling back to '{ollama_model}'."
    )

pipeline = LangGraphTranslationPipeline(
    source_lang="en",
    target_lang="sr",
    model=ollama_model,
    temperature=OLLAMA_TEMPERATURE,
    timeout=OLLAMA_TIMEOUT,
)

print(f"Using Ollama model: {ollama_model}")

Using Ollama model: gpt-oss:120b


In [4]:
preview_limit = 200


def preview_text(text: str | None, limit: int = preview_limit) -> str:
    if not text:
        return ""
    return text[:limit] + ("… [truncated]" if len(text) > limit else "")


# Build the pipeline input from the English side of the sample pair;
# "id" and "english_id" both carry the same identifier.
synset_input = dict(
    id=sample_pair.get("english_id"),
    english_id=sample_pair.get("english_id"),
    lemmas=sample_pair.get("english_lemmas", []),
    definition=sample_pair.get("english_definition", ""),
    examples=sample_pair.get("english_examples", []),
    pos=sample_pair.get("english_pos"),
)

# Announce what is about to be translated and list the stages.
print("=" * 70, "TRANSLATING SYNSET WITH MULTI-STEP PIPELINE", "=" * 70, sep="\n")
print(f"Input synset ID: {synset_input.get('id')}")
print(f"English lemmas: {synset_input.get('lemmas')}")
print(f"Definition: {synset_input.get('definition')}")
print(
    "\nRunning through 6-stage pipeline...",
    "  Stage 1: analyse_sense",
    "  Stage 2: translate_definition",
    "  Stage 3: translate_all_lemmas (NEW)",
    "  Stage 4: expand_synonyms (NEW)",
    "  Stage 5: filter_synonyms (NEW)",
    "  Stage 6: assemble_result",
    "=" * 70,
    sep="\n",
)

result = pipeline.translate_synset(synset_input)

# Pull the fields used below out of the result, with defaults for absent keys.
translation = result.get("translation", "")
definition_translation = result.get("definition_translation", "")
translated_synonyms = result.get("translated_synonyms", [])
examples = result.get("examples", [])
notes = result.get("notes")
curator_summary = result.get("curator_summary", "")
raw_response = result.get("raw_response", "")

print("", "=" * 70, "RESULTS", "=" * 70, sep="\n")
print(f"Representative literal: {translation}")
print(f"Final synset ({len(translated_synonyms)} literals): {translated_synonyms}")
print(f"Example count: {len(examples)}")
print(f"Definition translation length: {len(definition_translation)} characters")

print("", "=" * 70, "CURATOR SUMMARY", "=" * 70, sep="\n")
print(curator_summary)

print("", "=" * 70, "DEFINITION TRANSLATION", "=" * 70, sep="\n")
print(definition_translation)

# Walk through what each synonym stage contributed.
print("", "=" * 70, "PIPELINE STAGE DETAILS", "=" * 70, sep="\n")


def _print_numbered(entries):
    """Print every entry on its own indented line, numbered from 1."""
    for position, entry in enumerate(entries, start=1):
        print(f"  {position}. {entry}")


# Per-stage payloads; any missing level falls back to an empty container.
payload = result.get("payload", {})
initial_payload = payload.get("initial_translation", {})
expansion_payload = payload.get("expansion", {})
filtering_payload = payload.get("filtering", {})

initial_translations = initial_payload.get("initial_translations", [])
expanded_synonyms = expansion_payload.get("expanded_synonyms", [])
filtered_synonyms = filtering_payload.get("filtered_synonyms", [])

print(f"\n📊 Stage 3 - Initial Translations ({len(initial_translations)} lemmas):")
_print_numbered(initial_translations)

print(f"\n🔍 Stage 4 - Expanded Candidates ({len(expanded_synonyms)} synonyms):")
_print_numbered(expanded_synonyms)

print(f"\n✅ Stage 5 - Filtered Results ({len(filtered_synonyms)} final literals):")
_print_numbered(filtered_synonyms)

# Candidates present after expansion but absent after filtering.
removed = set(expanded_synonyms) - set(filtered_synonyms)
if not removed:
    print("\n✓ No items removed during filtering (all candidates passed validation)")
else:
    print(f"\n❌ Removed during filtering ({len(removed)} items):")
    for dropped in sorted(removed):
        print(f"  - {dropped}")

# Show at most three example sentences, each shortened for display.
if examples:
    print("", "=" * 70, "EXAMPLES", "=" * 70, sep="\n")
    for number, sentence in enumerate(examples[:3], start=1):
        print(f"{number}. {preview_text(sentence)}")
    if len(examples) > 3:
        print(f"... ({len(examples)} total)")

# Notes are optional; print a shortened version when present.
if notes:
    print("", "=" * 70, "NOTES", "=" * 70, sep="\n")
    print(preview_text(notes))

# Per-stage prompt/response previews; stages with an empty log entry are skipped.
print("", "=" * 70, "STAGE LOGS (Preview)", "=" * 70, sep="\n")
logs = result.get("payload", {}).get("logs", {})
for stage_name, stage_log in logs.items():
    if stage_log:
        print(f"\n[{stage_name.upper()}]")
        print(f"  Prompt preview: {preview_text(stage_log.get('prompt'), 150)}")
        print(f"  Response preview: {preview_text(stage_log.get('raw_response_preview'), 150)}")

print("", "=" * 70, "FULL RAW RESPONSE (Last Stage)", "=" * 70, sep="\n")
print(preview_text(raw_response, 400))

TRANSLATING SYNSET WITH MULTI-STEP PIPELINE
Input synset ID: ENG30-03574555-n
English lemmas: ['institution']
Definition: an establishment consisting of a building or complex of buildings where an organization for the promotion of some cause is situated

Running through 6-stage pipeline...
  Stage 1: analyse_sense
  Stage 2: translate_definition
  Stage 3: translate_all_lemmas (NEW)
  Stage 4: expand_synonyms (NEW)
  Stage 5: filter_synonyms (NEW)
  Stage 6: assemble_result

RESULTS
Representative literal: sedište
Final synset (1 literals): ['sedište']
Example count: 2
Definition translation length: 91 characters

CURATOR SUMMARY
Representative literal (sr): sedište
Definition translation: zgrada ili skup zgrada u kome se nalazi organizacija posvećena promovisanju određenog cilja
Lexname: noun.artifact
Synset literals (1 total):
  • sedište
Example sentences: 2 (showing first)
  “Sedište udruženja za zaštitu životne sredine je u jednoj zgradi u centru grada.”
Notes: Gloss kept concise 

In [5]:
# Condense the result into a small JSON document:
# the synset literals plus per-stage candidate counts.

# "removed/expanded" ratio; reported as "0/0" when nothing was expanded.
if expanded_synonyms:
    removal_rate = f"{len(removed)}/{len(expanded_synonyms)}"
else:
    removal_rate = "0/0"

stage_counts = {
    "initial_translations": len(initial_translations),
    "expanded_candidates": len(expanded_synonyms),
    "filtered_results": len(filtered_synonyms),
    "removal_rate": removal_rate,
}

simple_output = {
    "english_id": synset_input.get("id"),
    "representative_literal": translation,  # First literal (for convenience)
    "synset_literals": translated_synonyms,  # The actual synset
    "literal_count": len(translated_synonyms),
    "definition_translation": definition_translation,
    "pipeline_stages": stage_counts,
}

# Written relative to the current working directory.
output_path = Path("simple_translation_output.json")
with output_path.open("w", encoding="utf-8") as out_file:
    json.dump(simple_output, out_file, ensure_ascii=False, indent=2)

print("=" * 70, "SIMPLE JSON OUTPUT", "=" * 70, sep="\n")
print(f"Saved to: {output_path.resolve()}\n")
print(json.dumps(simple_output, ensure_ascii=False, indent=2))

SIMPLE JSON OUTPUT
Saved to: E:\Github\wordnet_autotranslate\notebooks\simple_translation_output.json

{
  "english_id": "ENG30-03574555-n",
  "representative_literal": "sedište",
  "synset_literals": [
    "sedište"
  ],
  "literal_count": 1,
  "definition_translation": "zgrada ili skup zgrada u kome se nalazi organizacija posvećena promovisanju određenog cilja",
  "pipeline_stages": {
    "initial_translations": 1,
    "expanded_candidates": 4,
    "filtered_results": 1,
    "removal_rate": "3/4"
  }
}


## Understanding the Multi-Step Pipeline

### Generate-and-Filter Approach

The new pipeline uses a **3-stage synonym generation** process:

1. **Generate Initial** (`translate_all_lemmas`): Direct translation of each English lemma
2. **Expand** (`expand_synonyms`): Find additional synonyms in target language
3. **Filter** (`filter_synonyms`): Strict validation to remove imperfect matches

### Benefits

✅ **Higher Quality**: A dedicated filtering stage validates every candidate after generation and expansion, improving precision  
✅ **Broader Coverage**: Expansion finds native synonyms, not just translations  
✅ **Traceability**: Full audit trail showing progression  
✅ **No Headword**: Output is a true synset (set of synonymous literals)

### Next steps

- Translate multiple synsets: `pipeline.translate(list_of_synsets)`
- Use streaming for large batches: `pipeline.translate_stream(synsets)`
- Examine the `filtering_payload` to see which candidates were rejected
- Compare quality against the old single-step approach

## 🔍 Accessing Full LLM Outputs (Untruncated)

The pipeline preserves **two versions** of LLM call data:

- **`result["payload"]["logs"]`** - Truncated summaries for quick viewing (500 chars)
- **`result["payload"]["calls"]`** - **Full, untruncated** LLM interactions

The truncation exists to prevent memory bloat when processing hundreds of synsets, but you can always access the complete data via the `calls` dictionary.

In [6]:
# Example: inspect the short preview stored in the filtering stage's log entry
print("=== TRUNCATED LOGS (for quick viewing) ===\n", "Filtering stage log (truncated):", sep="\n")
truncated_log = result["payload"]["logs"]["filtering"]

# First 100 characters of the stored preview ('N/A' when the key is absent).
preview_head = truncated_log.get('raw_response_preview', 'N/A')[:100]
print(f"  Raw response preview: {preview_head}...")

# Length of the stored preview itself (0 when the key is absent).
preview_length = len(truncated_log.get('raw_response_preview', ''))
print(f"  (Response truncated at {preview_length} chars)")


=== TRUNCATED LOGS (for quick viewing) ===

Filtering stage log (truncated):
  Raw response preview: {
  "filtered_synonyms": ["sedište"],
  "removed": [
    {
      "word": "ustanova",
      "reason":...
  (Response truncated at 613 chars)


In [7]:
# Example: read the complete filtering-stage record from payload["calls"]
print("\n=== FULL UNTRUNCATED CALL DATA ===\n", "Filtering stage call (complete):", sep="\n")
full_call = result["payload"]["calls"]["filtering"]
print(f"  Stage: {full_call['stage']}")

raw_text = full_call['raw_response']
print(f"  Full raw response length: {len(raw_text)} chars")
print(f"  Full raw response:\n{raw_text}")

parsed = full_call['payload']
print(f"\n  Parsed payload: {parsed}")


=== FULL UNTRUNCATED CALL DATA ===

Filtering stage call (complete):
  Stage: synonym_filtering
  Full raw response length: 649 chars
  Full raw response:
{
  "filtered_synonyms": ["sedište"],
  "removed": [
    {
      "word": "ustanova",
      "reason": "Broader/abstract meaning; refers to an institution as an organization, not specifically to the building that houses it."
    },
    {
      "word": "institucija",
      "reason": "Abstract noun denoting an institution; does not necessarily denote a concrete building or group of buildings."
    },
    {
      "word": "centar",
      "reason": "Can denote a physical building but also an abstract concept (e.g., ‘center of activity’); not a perfect synonym for a concrete establishment housing an organization."
    }
  ],
  "confidence": "high"
}

  Parsed payload: {'filtered_synonyms': ['sedište'], 'removed': [{'word': 'ustanova', 'reason': 'Broader/abstract meaning; refers to an institution as an organization, not specifically to the b

In [None]:
# Save full logs to file for later analysis
import json
from pathlib import Path

# Destination directory for exported logs; created if it does not exist yet.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)
full_logs_path = output_dir / "full_llm_logs.json"

# Collect the complete record of every call, keyed by stage name.
all_stages = {}
for stage_name, stage_call in result["payload"]["calls"].items():
    all_stages[stage_name] = {
        "prompt": stage_call.get("prompt", ""),
        "raw_response": stage_call.get("raw_response", ""),
        "parsed_payload": stage_call.get("payload", {}),
        "messages": stage_call.get("messages", []),
    }

full_logs = {
    "synset_id": result["source"]["id"],
    "translation": result["translation"],
    "all_stages": all_stages,
}

with full_logs_path.open("w", encoding="utf-8") as log_file:
    json.dump(full_logs, log_file, indent=2, ensure_ascii=False)

print(f"✅ Full untruncated logs saved to: {full_logs_path}")
print(f"📊 File size: {full_logs_path.stat().st_size:,} bytes")
print(
    "\n💡 Tip: These logs include:",
    "   - Complete prompts for each stage",
    "   - Full raw LLM responses (no truncation)",
    "   - Parsed payloads with validation results",
    "   - Complete message history",
    sep="\n",
)

### 📝 Why Truncate Logs?

**Memory & Performance Reasons:**

1. **Memory Bloat**: When processing 1,000+ synsets, full responses can consume GBs of RAM
2. **Serialization Size**: Saving results to JSON becomes impractical with full responses
3. **Quick Inspection**: Truncated logs allow fast debugging without scrolling through pages

**Best Practice:**
- Use `payload["logs"]` for quick debugging during development
- Use `payload["calls"]` to save full data for critical synsets
- Export full logs to separate files for detailed analysis (as shown above)

In [None]:
# Using the log utilities for easier log management
from wordnet_autotranslate.utils.log_utils import save_full_logs, analyze_stage_lengths

# Write the full logs to disk via the project helper.
log_path = save_full_logs(result, output_path="output/full_logs_example.json")
print(f"✅ Saved to: {log_path}")

# List per-stage response sizes, largest first.
print("\n📊 Response sizes by stage:")
lengths = analyze_stage_lengths(result)
by_size_desc = sorted(lengths.items(), key=lambda pair: pair[1], reverse=True)
for stage_name, char_count in by_size_desc:
    print(f"  {stage_name:20} {char_count:>8,} chars")