<a href="https://colab.research.google.com/github/sr606/LLM/blob/main/mermaid_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
fastapi
uvicorn
python-dotenv
openai>=1.0.0
graphviz


AZURE_OPENAI_API_KEY=your_key
AZURE_OPENAI_ENDPOINT=https://your-resource-name.openai.azure.com/
AZURE_OPENAI_API_VERSION=2024-02-15-preview
AZURE_OPENAI_DEPLOYMENT=your-deployment-name


#parser
def split_into_stages(text: str, marker: str = "// --- ["):
    """
    Split pseudocode into stage blocks.

    The text is cut at every occurrence of ``marker``; each piece is
    whitespace-stripped and pieces that end up empty are dropped.  The
    marker itself is not part of the returned blocks, and any non-blank
    text that precedes the first marker is returned as the first block.

    Args:
        text: Full pseudocode text.
        marker: Delimiter that opens a stage header.  Defaults to
            ``"// --- ["``; pass another string when the input uses a
            different header style.

    Returns:
        A list of non-empty, stripped blocks in their original order.

    Raises:
        ValueError: If ``marker`` is an empty string (from ``str.split``).
    """
    stripped_pieces = (piece.strip() for piece in text.split(marker))
    return [piece for piece in stripped_pieces if piece]


#llm_service
import os
import json
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()


class LLMService:
    """
    Wrapper around an Azure OpenAI chat deployment used to analyze ETL stages.

    Attributes:
        client: ``AzureOpenAI`` client configured from the
            ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_API_VERSION`` and
            ``AZURE_OPENAI_ENDPOINT`` environment variables.
        deployment: Deployment name read from ``AZURE_OPENAI_DEPLOYMENT``;
            passed as ``model`` on every request.
        system_prompt: Instruction text sent as the system message.
    """

    def __init__(self):
        """
        Build the client and prompt from environment variables.
        """

        self.client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        )

        self.deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")

        self.system_prompt = """
You are an ETL Stage Analyzer.

Extract:
- stage_name
- stage_type
- short transformation summary (3-5 bullet points)

Return valid JSON only in this format:

{
  "stage_name": "...",
  "stage_type": "...",
  "summary": [
      "bullet 1",
      "bullet 2",
      "bullet 3"
  ]
}

Do not include explanations.
Do not include markdown.
"""

    def analyze_stage(self, stage_block: str):
        """
        Ask the model to describe one stage block.

        Args:
            stage_block: Text of a single stage, sent as the user message.

        Returns:
            The decoded JSON object (a ``dict``) when the reply is a JSON
            object.  Otherwise a dict with the keys ``"error"`` and
            ``"raw_response"`` (the stripped reply text, ``""`` when the
            reply carried no text).

        Raises:
            Whatever the client raises for the request itself; request
            errors are not caught here.
        """

        response = self.client.chat.completions.create(
            model=self.deployment,
            temperature=0.0,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": stage_block}
            ],
        )

        # The message content can be None (no text in the reply); treat
        # that as an empty reply instead of failing on ``None.strip()``.
        content = (response.choices[0].message.content or "").strip()

        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            parsed = None

        # Only a JSON object matches the requested format; a bare list,
        # string or number would break consumers that index by key.
        if isinstance(parsed, dict):
            return parsed

        return {
            "error": "Invalid JSON from LLM",
            "raw_response": content
        }



#agent
import os
import json
from parser import split_into_stages
from llm_service import LLMService

INPUT_PATH = "../data/input/pseudocode.txt"
OUTPUT_PATH = "../data/output/metadata.json"


def run_agent():
    """
    Read the pseudocode file, analyze every stage and write the metadata.

    Steps:
      1. Read ``INPUT_PATH``; print a message and return if it is missing.
      2. Split the text with ``split_into_stages``; print a message and
         return if nothing was found.
      3. Call ``LLMService.analyze_stage`` once per block, in order.
      4. Write the list of results as JSON to ``OUTPUT_PATH``.

    Returns:
        None.  Progress and outcome are reported with ``print``.
    """

    try:
        with open(INPUT_PATH, "r", encoding="utf-8") as f:
            pseudocode = f.read()
    except FileNotFoundError:
        print("Input file not found.")
        return

    stage_blocks = split_into_stages(pseudocode)

    if not stage_blocks:
        print("No stage blocks detected.")
        return

    llm = LLMService()
    results = []
    total = len(stage_blocks)

    for idx, block in enumerate(stage_blocks, start=1):
        print(f"Processing stage {idx}/{total}")
        results.append(llm.analyze_stage(block))

    # Derive the directory from OUTPUT_PATH so the two cannot drift apart;
    # an empty dirname means "current directory" and needs no creation.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print("Extraction complete.")
    print(f"Metadata saved at {OUTPUT_PATH}")


if __name__ == "__main__":
    run_agent()



#server
from fastapi import FastAPI
from agent import run_agent

app = FastAPI()


@app.post("/generate-diagram")
def generate():
    """
    Handle POST /generate-diagram by running the agent.

    The agent's return value is not inspected, so this payload is sent
    whenever ``run_agent`` returns without raising.
    """
    run_agent()
    return dict(status="success", message="Metadata generated")


#client
import requests

# Fail fast when the server is unreachable (5 s connect timeout) but leave
# the read side unbounded: the endpoint only answers after the whole agent
# run has finished, which can take a while.
response = requests.post(
    "http://127.0.0.1:8000/generate-diagram",
    timeout=(5, None),
)

# Surface HTTP errors directly instead of failing later while decoding a
# non-JSON error body.
response.raise_for_status()

print(response.json())


In [None]:
#graph_model

class Node:
    """
    One stage in the pipeline graph.

    Attributes:
        name: Stage name; ``Graph`` uses it as the node key.
        stage_type: Stage type label.
        summary: Summary bullets for the stage.
        link_files: Mapping of link name -> file name.  Always a dict
            (empty when none were supplied) so consumers can read it
            without checking for the attribute.
    """

    def __init__(self, name, stage_type, summary, link_files=None):
        self.name = name
        self.stage_type = stage_type
        self.summary = summary
        # Copy into a fresh dict so nodes never share one mapping and the
        # caller's dict is not aliased.
        self.link_files = dict(link_files) if link_files else {}


class Graph:
    """
    Directed stage graph.

    ``nodes`` maps a node name to its ``Node``; ``edges`` is a set of
    ``(source, target)`` name pairs, so duplicate edges collapse.
    """

    def __init__(self):
        self.nodes = dict()
        self.edges = set()   # (source, target)

    def add_node(self, node: Node):
        """Store ``node`` under its name, replacing any node with that name."""
        key = node.name
        self.nodes[key] = node

    def add_edge(self, source, target):
        """Record the edge ``source -> target``; self-loops are ignored."""
        if source == target:
            return
        self.edges.add((source, target))


#graph_builder

import re
from graph_model import Graph, Node


def build_graph(pseudocode_text, stage_metadata_list):
    """
    Build the stage-to-stage graph deterministically.

    Edges come from the pseudocode text: every ``// --- [<header>] ---``
    block is scanned for ``Input: ← dataset_N (<name>)`` and
    ``Output: → dataset_N (<name>)`` lines, and the stage that outputs a
    dataset is linked to every stage that takes it as input.  The stage
    name is the text after the first ``:`` in the header; headers without
    a ``:`` are skipped.  If several stages output the same dataset, the
    last one in the text is used as its producer.

    Nodes come from ``stage_metadata_list``.

    Args:
        pseudocode_text: Full pseudocode text.
        stage_metadata_list: Iterable of per-stage metadata dicts with the
            keys ``stage_name``, ``stage_type`` and ``summary``.  Entries
            that are not dicts, carry an ``"error"`` key, or have no
            ``stage_name`` are skipped; a missing ``stage_type`` becomes
            ``""`` and a missing ``summary`` becomes ``[]``.

    Returns:
        Graph: The populated graph.
    """

    graph = Graph()

    # -----------------------------
    # Extract stage blocks again
    # -----------------------------
    pattern = r"// --- \[(.*?)\] ---([\s\S]*?)(?=// --- \[|$)"

    dataset_producer = {}  # dataset_name -> stage_name
    dataset_consumers = {}  # dataset_name -> [stage_names]

    for header, body in re.findall(pattern, pseudocode_text):

        parts = header.split(":")
        if len(parts) < 2:
            continue

        stage_name = parts[1].strip()

        # Datasets this stage reads
        for dataset in re.findall(r"Input:\s*←\s*dataset_\d+\s*\((.*?)\)", body):
            dataset_consumers.setdefault(dataset, []).append(stage_name)

        # Datasets this stage writes
        for dataset in re.findall(r"Output:\s*→\s*dataset_\d+\s*\((.*?)\)", body):
            dataset_producer[dataset] = stage_name

    # -----------------------------
    # Add Nodes from metadata
    # -----------------------------
    for stage_data in stage_metadata_list:
        # The metadata is LLM output: tolerate entries that are not
        # objects or that miss fields instead of aborting the whole build.
        if not isinstance(stage_data, dict) or "error" in stage_data:
            continue

        node_name = stage_data.get("stage_name")
        if not node_name:
            continue

        summary = stage_data.get("summary") or []
        if isinstance(summary, str):
            # A single string would otherwise be sliced character by
            # character by consumers that take ``summary[:n]``.
            summary = [summary]

        graph.add_node(
            Node(
                name=node_name,
                stage_type=stage_data.get("stage_type") or "",
                summary=summary,
            )
        )

    # -----------------------------
    # Create stage-to-stage edges
    # -----------------------------
    for dataset, producer in dataset_producer.items():
        for consumer in dataset_consumers.get(dataset, []):
            graph.add_edge(producer, consumer)

    return graph



#run_agent()

from graph_builder import build_graph


def run_agent():
    """
    Analyze every stage, save the metadata and build the stage graph.

    Reads ``INPUT_PATH``, sends each block from ``split_into_stages`` to
    ``LLMService.analyze_stage``, writes the results as JSON to
    ``OUTPUT_PATH``, builds the graph with ``build_graph`` and prints its
    nodes and edges.

    Returns:
        Graph: The graph that was built, so later steps (pipeline
        detection, naming, rendering) can use it.

    Raises:
        FileNotFoundError: If ``INPUT_PATH`` does not exist.
    """

    with open(INPUT_PATH, "r", encoding="utf-8") as f:
        pseudocode = f.read()

    stage_blocks = split_into_stages(pseudocode)

    llm = LLMService()
    metadata_results = [llm.analyze_stage(block) for block in stage_blocks]

    # Save metadata.  The directory is derived from OUTPUT_PATH so the
    # two cannot drift apart; an empty dirname needs no creation.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(metadata_results, f, indent=4)

    # Build graph
    graph = build_graph(pseudocode, metadata_results)

    # Print graph info
    print("Nodes:")
    for node in graph.nodes.values():
        print("-", node.name)

    print("\nEdges:")
    for edge in graph.edges:
        print("-", edge)

    return graph







# Extract link-file bindings

# NOTE(review): `body` is not assigned at module level anywhere in this view;
# presumably this snippet is meant to run inside build_graph's per-stage loop,
# where `body` holds one stage's text — confirm before relying on it.
link_file_matches = re.findall(r"Link File \((.*?)\):\s*(.*)", body)

# Map each link name to its file name, trimming surrounding whitespace.
stage_link_files = {
    link_name.strip(): file_name.strip()
    for link_name, file_name in link_file_matches
}



#pipeline_detector

from collections import defaultdict, deque


def detect_pipelines(graph):
    """
    Group stages into pipelines (connected components).

    Edge direction is ignored: two stages belong to the same pipeline when
    any chain of edges links them.  The search starts from the names in
    ``graph.nodes`` in their iteration order, so the returned list follows
    that order; edge endpoints reached along the way are included even if
    they are not keys of ``graph.nodes``.

    Returns:
        list[set]: One set of stage names per pipeline.
    """

    # Symmetric neighbour map so connectivity ignores edge direction.
    neighbours = defaultdict(set)
    for src, dst in graph.edges:
        neighbours[src].add(dst)
        neighbours[dst].add(src)

    seen = set()
    groups = []

    for start in graph.nodes:
        if start in seen:
            continue

        # Breadth-first walk; names are marked as seen when enqueued.
        seen.add(start)
        members = {start}
        frontier = deque([start])

        while frontier:
            here = frontier.popleft()
            for nxt in neighbours[here]:
                if nxt in seen:
                    continue
                seen.add(nxt)
                members.add(nxt)
                frontier.append(nxt)

        groups.append(members)

    return groups



from pipeline_detector import detect_pipelines

# NOTE(review): `graph` is only assigned inside functions in this view, so it
# must be provided at module level before this cell runs — confirm.
pipelines = detect_pipelines(graph)

# List every pipeline with its member stages, numbered from 1.
print("\nDetected Pipelines:")
for number, members in enumerate(pipelines, 1):
    print(f"\nPipeline {number}:")
    for member in members:
        print(f"  - {member}")




#pipeline_name
from llm_service import LLMService


def generate_pipeline_name(graph, pipeline_stages):
    """
    Hybrid pipeline naming: ask the LLM, fall back to a stage name.

    Args:
        graph: Graph providing ``edges`` ((source, target) pairs) and
            ``nodes`` (name -> node with ``name``, ``stage_type``,
            ``summary``).
        pipeline_stages: Collection (typically a set) of stage names that
            form one pipeline.

    Returns:
        str: The LLM-provided name when the call succeeds and the reply is
        longer than two characters.  Otherwise the fallback: the
        alphabetically first stage without incoming edges, or the
        alphabetically first stage if every stage has one.  An empty
        ``pipeline_stages`` yields ``"Unnamed Pipeline"`` without calling
        the LLM.
    """
    import logging  # local import: only needed to report a failed LLM call

    # Sets of strings have no stable iteration order across runs, so sort
    # once to keep both the fallback and the prompt reproducible.
    ordered_stages = sorted(pipeline_stages)

    if not ordered_stages:
        return "Unnamed Pipeline"

    # ---------------------------
    # Step 1: Deterministic Fallback
    # ---------------------------

    # Count incoming edges per stage to find the sources.
    incoming = dict.fromkeys(ordered_stages, 0)

    for _source, target in graph.edges:
        if target in incoming:
            incoming[target] += 1

    fallback_name = next(
        (stage for stage in ordered_stages if incoming[stage] == 0),
        ordered_stages[0],
    )

    # ---------------------------
    # Step 2: Build Structured Summary
    # ---------------------------

    pipeline_summary = []

    for stage in ordered_stages:
        node = graph.nodes.get(stage)
        if not node:
            continue

        pipeline_summary.append({
            "stage_name": node.name,
            "stage_type": node.stage_type,
            "summary": node.summary
        })

    # ---------------------------
    # Step 3: Ask LLM For Better Name
    # ---------------------------

    prompt = f"""
You are generating a professional ETL pipeline name.

Based on the following structured pipeline stages,
generate a short professional name (max 6 words).

Return ONLY the name as plain text.
No explanation.

Pipeline Data:
{pipeline_summary}
"""

    try:
        # Constructed inside the try so a client/configuration failure
        # also ends in the fallback instead of aborting the run.
        llm = LLMService()

        response = llm.client.chat.completions.create(
            model=llm.deployment,
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You generate concise pipeline names."},
                {"role": "user", "content": prompt}
            ],
        )

        # Basic cleanup: drop newlines and surrounding whitespace; a reply
        # without text is treated as empty.
        reply = response.choices[0].message.content or ""
        name = reply.replace("\n", "").strip()

    except Exception:
        # Naming is best-effort: keep going with the fallback, but leave a
        # trace of what went wrong instead of swallowing it silently.
        logging.getLogger(__name__).warning(
            "LLM pipeline naming failed; using fallback name %r",
            fallback_name,
            exc_info=True,
        )
    else:
        if len(name) > 2:
            return name

    # ---------------------------
    # Step 4: Fallback
    # ---------------------------

    return fallback_name



from pipeline_namer import generate_pipeline_name


# Same listing as before, now with a generated name per pipeline.
print("\nDetected Pipelines:")

for number, members in enumerate(pipelines, 1):
    pipeline_name = generate_pipeline_name(graph, members)
    print(f"\nPipeline {number}: {pipeline_name}")

    for member in members:
        print(f"  - {member}")




#renderer

from graphviz import Digraph


def render_pipeline_pdf(graph, pipelines, pipeline_names, output_path):
    """
    Render the clustered pipeline diagram into a PDF.

    Args:
        graph: Graph providing ``nodes`` (name -> node with ``name``,
            ``stage_type``, ``summary`` and optionally ``link_files``) and
            ``edges`` ((source, target) name pairs).
        pipelines: Sequence of stage-name collections, one per pipeline.
        pipeline_names: Cluster labels; ``pipeline_names[i]`` labels
            ``pipelines[i]``.
        output_path: Path passed to ``Digraph.render``.

    Raises:
        IndexError: If ``pipeline_names`` is shorter than ``pipelines``.
    """

    dot = Digraph("ETL_Pipelines", format="pdf")
    dot.attr(rankdir="LR", size="8,5")

    # -----------------------------------
    # Create clusters per pipeline
    # -----------------------------------
    for idx, pipeline in enumerate(pipelines):

        # Graphviz draws a subgraph as a boxed cluster only when its name
        # starts with "cluster".
        with dot.subgraph(name=f"cluster_{idx}") as sub:

            sub.attr(label=pipeline_names[idx], style="rounded")

            for stage_name in pipeline:
                node = graph.nodes.get(stage_name)

                # Stage names without a node are left out of the cluster.
                if not node:
                    continue

                # Build node label: name, type, then up to four bullets.
                label_lines = [node.name, f"({node.stage_type})"]
                label_lines.extend(f"• {bullet}" for bullet in node.summary[:4])

                # Optional: include link file bindings.  Nodes are not
                # guaranteed to carry ``link_files``, so read it
                # defensively instead of assuming the attribute exists.
                link_files = getattr(node, "link_files", None)
                if link_files:
                    label_lines.append("")
                    label_lines.append("Link Files:")
                    label_lines.extend(
                        f"{link} → {file_name}"
                        for link, file_name in link_files.items()
                    )

                sub.node(
                    node.name,
                    label="\n".join(label_lines) + "\n",
                    shape="box"
                )

    # -----------------------------------
    # Add edges globally
    # -----------------------------------
    for source, target in graph.edges:
        dot.edge(source, target)

    # -----------------------------------
    # Render file
    # -----------------------------------
    dot.render(output_path, cleanup=True)


from renderer import render_pipeline_pdf

# One generated name per pipeline, in the same order as `pipelines`.
pipeline_names = [
    generate_pipeline_name(graph, pipeline) for pipeline in pipelines
]

# Base path handed to the renderer; the printed location below is this
# path with ".pdf" appended.
output_file = "../data/output/etl_pipeline_diagram"

render_pipeline_pdf(graph, pipelines, pipeline_names, output_file)

print("\nPDF Generated at:")
print(f"{output_file}.pdf")
