<a href="https://colab.research.google.com/github/sr606/LLM/blob/main/mermaid_trail_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!npm install -g mermaid-mcp

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K
added 1 package in 2s
[1G[0K⠏[1G[0K

In [None]:
%%writefile diagram.mmd
graph TD
  A-->B
  B-->C
  C-->A

Writing diagram.mmd


In [None]:
!bash -c "export PATH=/usr/bin:$PATH; GLOBAL_NPM_BIN=$(/usr/bin/npm bin -g); export PATH=$GLOBAL_NPM_BIN:$PATH; mmdc -i diagram.mmd -o diagram.svg"

/bin/bash: line 1: /usr/bin/npm: No such file or directory
bash: line 1: mmdc: command not found


In [5]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Parse a DataStage-like pseudo code text file and render a flowchart as PDF using GraphViz.

Usage:
    python pseudo_to_flowchart.py input.txt -o vor_job -f pdf --view

Notes:
- Groups nodes by StageType (clusters).
- Connects stages by datasets: if Stage A outputs dataset_4 and Stage B inputs dataset_4, creates edge A -> B.
- Edge labels show dataset and Link name (if found).
"""

from __future__ import annotations
import argparse
import os
import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

from graphviz import Digraph

# ----------------------------
# Data structures
# ----------------------------

@dataclass
class DatasetRef:
    dataset_id: str
    dataset_name: Optional[str] = None
    link_name: Optional[str] = None

@dataclass
class Stage:
    stage_id: str
    display_name: str
    bracket_kind: Optional[str] = None      # e.g., CUSTOMSTAGE / HASHEDFILESTAGE (from header)
    stage_type: Optional[str] = None        # e.g., OracleConnector / CTransformerStage (from body)
    lines: List[str] = field(default_factory=list)
    inputs: List[DatasetRef] = field(default_factory=list)
    outputs: List[DatasetRef] = field(default_factory=list)

# ----------------------------
# Helpers
# ----------------------------

def sanitize_id(text: str) -> str:
    """Make a safe GraphViz node id."""
    text = re.sub(r"[^A-Za-z0-9_]+", "_", text.strip())
    text = re.sub(r"_{2,}", "_", text)
    return text.strip("_") or "node"

def esc(text: Optional[str]) -> str:
    """Escape < and > unless you intend to use HTML-like labels."""
    if not text:
        return ""
    return text.replace("<", "&lt;").replace(">", "&gt;")

def parse_dataset_tokens(segment: str) -> List[DatasetRef]:
    """
    Parse patterns like:
      Input: ← dataset_4 (Tfm_LoadRecords) (Link: Load_HFVehicleoffRoad)
      Output: → dataset_3 (HF_FACT_VOR_DATA)
    Returns list of DatasetRef.
    """
    # Split by commas or just parse all occurrences
    refs: List[DatasetRef] = []

    # Regex finds: dataset_# (Name) optional (Link: Xxx)
    # dataset id
    ds_pattern = re.compile(
        r"(dataset_\d+)\s*(?:\(([^)]+)\))?\s*(?:\(Link:\s*([^)]+)\))?",
        re.IGNORECASE
    )

    for m in ds_pattern.finditer(segment):
        ds_id = m.group(1)
        ds_name = m.group(2).strip() if m.group(2) else None
        link = m.group(3).strip() if m.group(3) else None
        refs.append(DatasetRef(dataset_id=ds_id, dataset_name=ds_name, link_name=link))

    return refs

# ----------------------------
# Parser
# ----------------------------

def parse_pseudocode(path: str) -> List[Stage]:
    """
    Parses a pseudo code file into a list of stages with inputs/outputs and types.
    Recognizes:
      // --- [CUSTOMSTAGE : Name] [Lines ...] ---
      StageType: OracleConnector
      Input: ← dataset_... (Name) (Link: ...)
      Output: → dataset_... (Name)
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    stages: List[Stage] = []
    current: Optional[Stage] = None
    seen_ids: Dict[str, int] = defaultdict(int)

    # Header pattern for stages
    header_re = re.compile(
        r"^\s*//\s*---\s*\[\s*([A-Z]+STAGE)\s*:\s*([^\]]+?)\s*\]\s*\[.*?\]\s*---\s*$",
        re.IGNORECASE
    )

    for raw in lines:
        line = raw.rstrip("\n")

        # New stage header?
        h = header_re.match(line)
        if h:
            bracket_kind = h.group(1).strip()  # e.g., CUSTOMSTAGE, HASHEDFILESTAGE, TRANSFORMERSTAGE
            display_name = h.group(2).strip()

            # Create a unique stage_id
            base_id = sanitize_id(display_name) or "Stage"
            seen_ids[base_id] += 1
            stage_id = base_id if seen_ids[base_id] == 1 else f"{base_id}_{seen_ids[base_id]}"

            current = Stage(
                stage_id=stage_id,
                display_name=display_name,
                bracket_kind=bracket_kind,
            )
            stages.append(current)
            continue

        if current is None:
            # Outside any stage: skip
            continue

        current.lines.append(line)

        # StageType
        if line.strip().lower().startswith("stagetype:"):
            # e.g., StageType: OracleConnector
            st = line.split(":", 1)[1].strip()
            current.stage_type = st

        # Input lines
        if line.strip().lower().startswith("input:"):
            # capture the entire line (could contain multiple datasets)
            in_refs = parse_dataset_tokens(line)
            current.inputs.extend(in_refs)

        # Output lines
        if line.strip().lower().startswith("output:"):
            out_refs = parse_dataset_tokens(line)
            current.outputs.extend(out_refs)

    return stages

# ----------------------------
# Graph building
# ----------------------------

def build_graph(stages: List[Stage],
                rankdir: str = "LR",
                theme: str = "default") -> Digraph:
    """
    Build a Digraph from stages, connecting producers and consumers through datasets.
    """
    # cluster color themes by StageType (body)
    cluster_palette = {
        "OracleConnector":   ("#5DADE2", "#EBF5FB"),
        "CTransformerStage": ("#58D68D", "#E9F7EF"),
        "CHashedFileStage":  ("#AF7AC5", "#F5EEF8"),
        "CSeqFileStage":     ("#5499C7", "#EBF5FB"),
        # fallback
        "__default__":       ("#F4D03F", "#FEF9E7"),
    }

    dot = Digraph(
        name="ETL_Flow",
        graph_attr={"rankdir": rankdir, "fontname": "Helvetica"},
        node_attr={
            "shape": "box", "style": "rounded,filled", "fontname": "Helvetica",
            "fontsize": "10", "margin": "0.2,0.15", "fillcolor": "#FFFFFF", "color": "#2E4053"
        },
        edge_attr={"color": "#555555", "arrowsize": "0.8"}
    )

    # Group stages by StageType (body)
    by_type: Dict[str, List[Stage]] = defaultdict(list)
    for s in stages:
        key = s.stage_type or (s.bracket_kind or "__default__")
        by_type[key].append(s)

    # Keep a mapping from dataset -> producer stage(s)
    producers: Dict[str, List[Tuple[Stage, DatasetRef]]] = defaultdict(list)
    # Keep a mapping from dataset -> consumer stage(s)
    consumers: Dict[str, List[Tuple[Stage, DatasetRef]]] = defaultdict(list)

    # Render clusters and nodes
    for s_type, s_list in by_type.items():
        border, bg = cluster_palette.get(s_type, cluster_palette["__default__"])
        with dot.subgraph(name=f"cluster_{sanitize_id(s_type)}") as c:
            c.attr(
                label=esc(s_type),
                style="rounded",
                color=border,
                bgcolor=bg
            )
            for s in s_list:
                label = f"{s.display_name}"
                if s.stage_type:
                    label += f"\n({s.stage_type})"
                elif s.bracket_kind:
                    label += f"\n({s.bracket_kind})"

                # Tooltip summarizing IO
                tooltip_parts = []
                if s.inputs:
                    tooltip_parts.append("Inputs: " + ", ".join([r.dataset_id for r in s.inputs]))
                if s.outputs:
                    tooltip_parts.append("Outputs: " + ", ".join([r.dataset_id for r in s.outputs]))
                tooltip = "; ".join(tooltip_parts) if tooltip_parts else None

                c.node(s.stage_id, label=esc(label), tooltip=esc(tooltip) if tooltip else None)

                # Track producers & consumers by dataset id
                for ref in s.outputs:
                    producers[ref.dataset_id].append((s, ref))
                for ref in s.inputs:
                    consumers[ref.dataset_id].append((s, ref))

    # Create edges producer -> consumer for shared datasets
    seen_edges = set()
    for ds_id, prod_list in producers.items():
        cons_list = consumers.get(ds_id, [])
        for (p_stage, p_ref) in prod_list:
            for (c_stage, c_ref) in cons_list:
                if p_stage.stage_id == c_stage.stage_id:
                    continue
                edge_key = (p_stage.stage_id, c_stage.stage_id, ds_id)
                if edge_key in seen_edges:
                    continue
                seen_edges.add(edge_key)

                # Build label: dataset + (Link names if any)
                labels = [ds_id]
                # Prefer consumer link if present, else producer link
                link_name = c_ref.link_name or p_ref.link_name
                if link_name:
                    labels.append(f"Link: {link_name}")
                # If dataset has a display alias, include once
                alias = c_ref.dataset_name or p_ref.dataset_name
                if alias and alias != ds_id:
                    labels.append(alias)

                edge_label = " | ".join(labels)
                dot.edge(p_stage.stage_id, c_stage.stage_id, label=esc(edge_label))

    return dot

# ----------------------------
# CLI
# ----------------------------

def main():
    ap = argparse.ArgumentParser(description="Convert pseudo code text file to flowchart (GraphViz).")
    ap.add_argument("input", help="Path to pseudo code text file")
    ap.add_argument("-o", "--output", default="flowchart", help="Output filename without extension (default: flowchart)")
    ap.add_argument("-d", "--directory", default=None, help="Output directory")
    ap.add_argument("-f", "--format", default="pdf", choices=["pdf", "png", "svg"], help="Output format (default: pdf)")
    ap.add_argument("--rankdir", default="LR", choices=["LR", "TB", "BT", "RL"], help="Graph layout direction (default: LR)")
    ap.add_argument("--view", action="store_true", help="Open the rendered file after creation")
    ap.add_argument("--cleanup", action="store_true", help="Remove intermediate files after render")
    args = ap.parse_args(['/content/LS_Sample_job1 1 2_detailed_pseudocode.txt', '--output', 'flowchart', '--format', 'pdf', '--cleanup'])

    if not os.path.exists(args.input):
        raise FileNotFoundError(f"Input file not found: {args.input}")

    stages = parse_pseudocode(args.input)

    if not stages:
        raise RuntimeError("No stages detected. Ensure the pseudo code contains stage headers like:\n"
                           "// --- [CUSTOMSTAGE : Name] [Lines ...] ---")

    dot = build_graph(stages, rankdir=args.rankdir)
    path = dot.render(filename=args.output, directory=args.directory, format=args.format, view=args.view, cleanup=args.cleanup)
    print(f"Rendered: {path}")

if __name__ == "__main__":
    main()

Rendered: flowchart.pdf


In [1]:
import re
from graphviz import Digraph

def parse_pseudocode(text):
    """
    Extracts stages, inputs, outputs from pseudocode blocks.
    """
    stages = []
    current_stage = None

    lines = text.splitlines()

    stage_header_pattern = re.compile(r'^\s*// --- \[(.*?)\]\s*')
    input_pattern = re.compile(r'Input:\s*←\s*(.*?)\s*\(')
    output_pattern = re.compile(r'Output:\s*→\s*(.*?)\s*\(')

    for line in lines:
        header_match = stage_header_pattern.search(line)
        if header_match:
            # Start new stage
            if current_stage:
                stages.append(current_stage)
            current_stage = {
                "name": header_match.group(1),
                "inputs": [],
                "outputs": []
            }

        if current_stage:
            input_match = input_pattern.search(line)
            if input_match:
                current_stage["inputs"].append(input_match.group(1).strip())

            output_match = output_pattern.search(line)
            if output_match:
                current_stage["outputs"].append(output_match.group(1).strip())

    # Add last stage
    if current_stage:
        stages.append(current_stage)

    return stages


def build_graph(stages):
    """
    Builds a Graphviz Diagram from parsed stages.
    """
    g = Digraph("JobFlow", format="png")
    g.attr(rankdir="LR", fontsize="10")

    # Add stage nodes
    for stg in stages:
        g.node(stg["name"], stg["name"], shape="box", style="rounded,filled", color="lightblue")

    # Create edges from inputs to stages and stages to outputs
    for stg in stages:
        for inp in stg["inputs"]:
            g.node(inp, inp, shape="ellipse", color="gray")
            g.edge(inp, stg["name"])

        for out in stg["outputs"]:
            g.node(out, out, shape="ellipse", color="gray")
            g.edge(stg["name"], out)

    return g


# -------------------------
# USE IT
# -------------------------
if __name__ == "__main__":
    with open("LS_Sample_job1 1 2_detailed_pseudocode.txt", "r") as f:
        content = f.read()

    stages = parse_pseudocode(content)
    graph = build_graph(stages)
    graph.render("Job_Stage_Diagram", cleanup=True)

    print("Diagram generated: Job_Stage_Diagram.png")



Diagram generated: Job_Stage_Diagram.png
