<a href="https://colab.research.google.com/github/sr606/LLM/blob/main/mermaid_trail_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install graphviz



In [2]:
import re
import json
from graphviz import Digraph
from collections import defaultdict, deque


# =====================================================
# 1️⃣ CLASSIFY STAGE TYPE
# =====================================================

def classify_layer(stage_type):
    """Map a raw stage type string to a coarse layer tag.

    Matching is a case-insensitive substring test evaluated in a fixed
    priority order; the first keyword found decides the result.

    Args:
        stage_type: Stage type text as taken from a stage header.

    Returns:
        "TRANSFORM", "HASH", "FILE" or "DB" for a recognised keyword,
        otherwise "OTHER".
    """
    normalized = stage_type.upper()

    # Order matters: earlier entries take precedence over later ones.
    keyword_to_layer = (
        ("TRANSFORMER", "TRANSFORM"),
        ("HASHED", "HASH"),
        ("SEQ", "FILE"),
        ("ORACLE", "DB"),
        ("CUSTOM", "DB"),
    )

    for keyword, layer in keyword_to_layer:
        if keyword in normalized:
            return layer
    return "OTHER"


# =====================================================
# 2️⃣ PARSE PSEUDOCODE INTO METADATA
# =====================================================

def parse_pseudocode(text):
    """Parse stage-oriented pseudocode text into per-stage metadata.

    The text is scanned line by line. A header of the form
    ``--- [<type> : <name>]`` opens a new stage, and every following line is
    attributed to that stage until the next header. Lines that appear before
    the first header are ignored. A repeated stage name replaces the earlier
    record.

    Once a line containing ``SQL:`` is seen, every remaining line of that
    stage is additionally scanned as SQL: table names following FROM/JOIN are
    collected, and ``joins`` / ``logic_complexity`` are incremented once per
    line that contains the JOIN keyword / the CASE or IF keyword.

    Keywords are matched as whole words, so identifiers that merely contain
    them (for example ``MODIFIED_DATE`` or ``JOINED_ON``) do not inflate the
    counters.

    Args:
        text: Full pseudocode text.

    Returns:
        A ``(stages, dataset_producers)`` tuple. ``stages`` maps each stage
        name to a dict with the keys ``type``, ``layer``, ``inputs``,
        ``outputs``, ``tables``, ``stagevars``, ``constraints``, ``joins``
        and ``logic_complexity``. ``dataset_producers`` maps each dataset id
        found on an ``Output:`` line to the name of the stage that declared
        it (the last one wins).
    """
    # Compile once up front instead of handing pattern strings to re.search
    # for every line.
    stage_re = re.compile(r"--- \[(.*?) : (.*?)\]")
    input_re = re.compile(r"Input:\s*‚Üê\s*(dataset_\d+)")
    output_re = re.compile(r"Output:\s*‚Üí\s*(dataset_\d+)")
    stagevar_re = re.compile(r"StageVar (.*)")
    constraint_re = re.compile(r"Constraint \((.*?)\): (.*)")
    table_re = re.compile(r"\bFROM\s+([\w\.]+)|\bJOIN\s+([\w\.]+)", re.IGNORECASE)
    join_keyword_re = re.compile(r"\bJOIN\b", re.IGNORECASE)
    logic_keyword_re = re.compile(r"\b(?:CASE|IF)\b", re.IGNORECASE)

    stages = {}
    dataset_producers = {}

    current_stage = None
    collecting_sql = False

    for line in text.splitlines():

        stage_match = stage_re.search(line)
        if stage_match:
            stage_type = stage_match.group(1).strip()
            current_stage = stage_match.group(2).strip()
            collecting_sql = False  # SQL mode never carries over to a new stage

            stages[current_stage] = {
                "type": stage_type,
                "layer": classify_layer(stage_type),
                "inputs": [],
                "outputs": [],
                "tables": [],
                "stagevars": [],
                "constraints": [],
                "joins": 0,
                "logic_complexity": 0
            }
            continue

        if not current_stage:
            continue

        info = stages[current_stage]

        # The "SQL:" marker line itself carries no metadata.
        if "SQL:" in line:
            collecting_sql = True
            continue

        if collecting_sql:
            # findall yields one (from_table, join_table) tuple per hit;
            # exactly one of the two groups is non-empty.
            for groups in table_re.findall(line):
                info["tables"].extend(table for table in groups if table)

            if join_keyword_re.search(line):
                info["joins"] += 1

            if logic_keyword_re.search(line):
                info["logic_complexity"] += 1

        # Inputs
        input_match = input_re.search(line)
        if input_match:
            info["inputs"].append(input_match.group(1))

        # Outputs (also record which stage produces the dataset)
        output_match = output_re.search(line)
        if output_match:
            dataset = output_match.group(1)
            info["outputs"].append(dataset)
            dataset_producers[dataset] = current_stage

        # Stage variables
        stagevar_match = stagevar_re.search(line)
        if stagevar_match:
            info["stagevars"].append(stagevar_match.group(1))

        # Constraints: only the expression is kept, not the link name
        constraint_match = constraint_re.search(line)
        if constraint_match:
            info["constraints"].append(constraint_match.group(2))

    return stages, dataset_producers


# =====================================================
# 3️⃣ COMPUTE EXECUTION DEPTH
# =====================================================

def compute_depths(stages, dataset_producers):
    """Assign each stage a depth based on its dataset dependencies.

    An edge runs from the producer of a dataset to every stage listing that
    dataset as an input; inputs without a known producer are ignored. Stages
    with no such incoming edge get depth 0. Every other stage gets its depth
    once all of its incoming edges have been resolved: one more than the
    depth of the producer that was resolved last.

    Args:
        stages: Mapping of stage name to metadata; only ``"inputs"`` is read.
        dataset_producers: Mapping of dataset id to producing stage name.

    Returns:
        Dict of stage name to depth. Stages whose incoming edges are never
        all resolved (for example on a dependency cycle) are absent.
    """
    consumers_of = defaultdict(list)
    pending = dict.fromkeys(stages, 0)

    # Collect producer -> consumer edges in the order the inputs are listed.
    links = [
        (dataset_producers[dataset], consumer)
        for consumer, meta in stages.items()
        for dataset in meta["inputs"]
        if dataset in dataset_producers
    ]
    for producer, consumer in links:
        consumers_of[producer].append(consumer)
        pending[consumer] += 1

    # Roots are the stages nothing feeds into.
    depth = {name: 0 for name in stages if pending[name] == 0}
    ready = deque(depth)

    while ready:
        current = ready.popleft()
        child_depth = depth[current] + 1
        for child in consumers_of[current]:
            pending[child] -= 1
            if pending[child] == 0:
                depth[child] = child_depth
                ready.append(child)

    return depth


# =====================================================
# 4️⃣ BUILD GRAPHVIZ FROM METADATA
# =====================================================

def _stage_label(stage, info):
    """Build the multi-line node label summarising one stage's metadata."""
    label = f"{stage}\n"
    label += f"Type: {info['type']}\n"
    label += f"In: {len(info['inputs'])} | Out: {len(info['outputs'])}\n"

    if info["tables"]:
        # Distinct tables only; the parser may record a table more than once.
        label += f"Tables: {len(set(info['tables']))}\n"

    if info["joins"] > 0:
        label += f"Joins: {info['joins']}\n"

    if info["stagevars"]:
        label += f"StageVars: {len(info['stagevars'])}\n"

    if info["constraints"]:
        label += f"Constraints: {len(info['constraints'])}"

    return label


def _layer_fill_color(layer):
    """Return the node fill colour for a layer tag (light grey if unknown)."""
    palette = {
        "DB": "#AED6F1",
        "TRANSFORM": "#F9E79F",
        "HASH": "#ABEBC6",
        "FILE": "#F5B7B1",
    }
    return palette.get(layer, "#F4F6F7")


def build_graphviz(stages, dataset_producers):
    """Build a left-to-right Graphviz diagram of the stage flow.

    Stages are grouped into ``rank="same"`` subgraphs by the depth returned
    from ``compute_depths``. Stages with layer "DB" that no other stage
    consumes from are moved one level past the deepest computed depth. Edges
    run from the producer of a dataset to each stage that lists the dataset
    as an input; edges leaving a "HASH" stage are drawn gray with
    ``constraint="false"``.

    Empty input, or input in which no stage receives a depth, yields a graph
    instead of raising. Stages without a depth are still emitted as labelled
    nodes, just outside any rank group.

    Args:
        stages: Mapping of stage name to the metadata dict produced by
            ``parse_pseudocode``.
        dataset_producers: Mapping of dataset id to producing stage name.

    Returns:
        The populated ``Digraph``.
    """
    dot = Digraph("ETL_Flow", engine="dot")
    dot.attr(rankdir="LR", splines="spline", nodesep="0.8", ranksep="1.2")

    dot.attr("node",
             shape="box",
             style="rounded,filled",
             width="3",
             height="1")

    depths = compute_depths(stages, dataset_producers)

    # Producer -> consumer pairs, in the order the inputs are listed.
    edges = [
        (dataset_producers[dataset], stage)
        for stage, info in stages.items()
        for dataset in info["inputs"]
        if dataset in dataset_producers
    ]
    has_consumers = {parent for parent, _ in edges}

    # default=0 keeps this from raising when no stage received a depth.
    max_depth = max(depths.values(), default=0)

    # Final DB targets (nothing reads from them) go to the far end.
    for stage, info in stages.items():
        if stage not in has_consumers and info["layer"] == "DB":
            depths[stage] = max_depth + 1

    # Group by depth
    levels = defaultdict(list)
    for stage, d in depths.items():
        levels[d].append(stage)

    # Create one same-rank subgraph per depth level
    for d in sorted(levels):
        with dot.subgraph() as s:
            s.attr(rank="same")
            for stage in levels[d]:
                info = stages[stage]
                s.node(stage,
                       _stage_label(stage, info),
                       fillcolor=_layer_fill_color(info["layer"]))

    # Stages that never received a depth would otherwise only appear as
    # unlabelled nodes implied by their edges.
    for stage, info in stages.items():
        if stage not in depths:
            dot.node(stage,
                     _stage_label(stage, info),
                     fillcolor=_layer_fill_color(info["layer"]))

    # Add edges
    for parent, stage in edges:
        if stages[parent]["layer"] == "HASH":
            dot.edge(parent, stage,
                     constraint="false",
                     color="gray")
        else:
            dot.edge(parent, stage)

    return dot


# =====================================================
# 5️⃣ RUN + EXPORT
# =====================================================

# Read the pseudocode explicitly as UTF-8: the parser matches non-ASCII
# marker characters, so the platform default encoding must not decide how
# the file is decoded.
with open("Samle_Job2 1 2_detailed_pseudocode.txt", "r", encoding="utf-8") as f:
    pseudo_text = f.read()

stages, producers = parse_pseudocode(pseudo_text)

# Export JSON metadata
metadata = {
    "stages": stages,
    "dependencies": producers
}

with open("etl_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

# Build diagram; render() writes etl_flow_detailed.pdf next to the notebook
graph = build_graphviz(stages, producers)
graph.render("etl_flow_detailed", format="pdf")

print("‚úÖ JSON exported as etl_metadata.json")
print("‚úÖ Diagram generated as etl_flow_detailed.pdf")


‚úÖ JSON exported as etl_metadata.json
‚úÖ Diagram generated as etl_flow_detailed.pdf


In [5]:
import re
import json
from collections import defaultdict


# ==========================================================
# 1️⃣ CLASSIFY STAGE LAYER
# ==========================================================

def classify_layer(stage_type):
    """Return the diagram layer name for a raw stage type string.

    Keywords are looked for case-insensitively as substrings, in priority
    order; the first rule with a matching keyword decides the layer.

    Args:
        stage_type: Stage type text as taken from a stage header.

    Returns:
        "Transform", "Intermediate", "Outputs" or "Source_Target" for a
        recognised keyword, otherwise "Other".
    """
    upper_type = stage_type.upper()

    # (keywords, layer) pairs, highest priority first.
    rules = (
        (("TRANSFORMER",), "Transform"),
        (("HASHED",), "Intermediate"),
        (("SEQ",), "Outputs"),
        (("ORACLE", "CUSTOM"), "Source_Target"),
    )

    return next(
        (
            layer
            for keywords, layer in rules
            if any(word in upper_type for word in keywords)
        ),
        "Other",
    )


# ==========================================================
# 2️⃣ PARSE PSEUDOCODE
# ==========================================================

def parse_pseudocode(text):
    """Parse stage-oriented pseudocode into stage metadata with link names.

    A header of the form ``--- [<type> : <name>]`` opens a stage; lines that
    follow belong to it until the next header, and lines before the first
    header are skipped. After a line containing ``SQL:``, the remaining lines
    of the stage are also scanned for FROM/JOIN table names and JOIN lines.

    Args:
        text: Full pseudocode text.

    Returns:
        A ``(stages, dataset_producers)`` tuple. ``stages`` maps each stage
        name to a dict with the keys ``type``, ``layer``, ``inputs``,
        ``outputs``, ``stagevars``, ``constraints``, ``tables``, ``joins``
        and ``business_rules``; inputs and outputs are
        ``(dataset_id, name_in_parentheses)`` tuples. ``dataset_producers``
        maps each output dataset id to the stage that declared it.
    """
    header_re = re.compile(r"--- \[(.*?) : (.*?)\]")
    consumed_re = re.compile(r"Input:\s*‚Üê\s*(dataset_\d+)\s*\((.*?)\)")
    produced_re = re.compile(r"Output:\s*‚Üí\s*(dataset_\d+)\s*\((.*?)\)")
    stagevar_re = re.compile(r"StageVar (.*)")
    constraint_re = re.compile(r"Constraint \((.*?)\): (.*)")
    table_re = re.compile(r"\bFROM\s+([\w\.]+)|\bJOIN\s+([\w\.]+)", re.IGNORECASE)

    stages = {}
    dataset_producers = {}

    active_name = None   # name of the stage currently being read
    record = None        # metadata dict of that stage
    in_sql = False

    for line in text.splitlines():
        header = header_re.search(line)
        if header:
            kind = header.group(1).strip()
            active_name = header.group(2).strip()
            in_sql = False
            record = {
                "type": kind,
                "layer": classify_layer(kind),
                "inputs": [],
                "outputs": [],
                "stagevars": [],
                "constraints": [],
                "tables": [],
                "joins": 0,
                "business_rules": []
            }
            stages[active_name] = record
            continue

        # Nothing to attribute the line to yet.
        if not active_name:
            continue

        if "SQL:" in line:
            in_sql = True
            continue

        shouted = line.upper()

        if in_sql:
            # Each findall hit is a (from_table, join_table) pair with one
            # side empty; keep whichever side matched.
            record["tables"].extend(
                name
                for pair in table_re.findall(line)
                for name in pair
                if name
            )
            if "JOIN" in shouted:
                record["joins"] += 1

        consumed = consumed_re.search(line)
        if consumed:
            record["inputs"].append((consumed.group(1), consumed.group(2)))

        produced = produced_re.search(line)
        if produced:
            dataset_id = produced.group(1)
            record["outputs"].append((dataset_id, produced.group(2)))
            dataset_producers[dataset_id] = active_name

        stagevar = stagevar_re.search(line)
        if stagevar:
            expression = stagevar.group(1).strip()
            record["stagevars"].append(expression)
            record["business_rules"].append(expression)

        constraint = constraint_re.search(line)
        if constraint:
            expression = constraint.group(2).strip()
            record["constraints"].append(expression)
            record["business_rules"].append(expression)

        # Any line with a space-delimited IF is kept verbatim as a rule.
        if " IF " in shouted:
            record["business_rules"].append(line.strip())

    return stages, dataset_producers


# ==========================================================
# 3️⃣ GENERATE MERMAID CODE
# ==========================================================

def generate_mermaid(stages, producers):
    """Render stage metadata as Mermaid ``flowchart LR`` source text.

    One ``subgraph`` is emitted per layer, in the order layers are first
    seen, containing a node for each stage of that layer. The node label
    shows the stage type and name, followed by join, stage-variable and
    constraint counts when they are non-zero. After the subgraphs, one
    labelled edge is emitted for every stage input whose dataset has a
    producer.

    Args:
        stages: Mapping of stage name to metadata dict; inputs are
            ``(dataset_id, dataset_name)`` tuples.
        producers: Mapping of dataset id to producing stage name.

    Returns:
        The Mermaid source as a single newline-joined string.
    """
    # Bucket stage names by layer, preserving first-seen order.
    members_by_layer = defaultdict(list)
    for name, meta in stages.items():
        members_by_layer[meta["layer"]].append(name)

    out = ["flowchart LR"]

    for layer_name, members in members_by_layer.items():
        out.append(f"  subgraph {layer_name}")
        for name in members:
            meta = stages[name]

            extras = []
            if meta["joins"] > 0:
                extras.append(f"<br/>Joins: {meta['joins']}")
            if meta["stagevars"]:
                extras.append(f"<br/>StageVars: {len(meta['stagevars'])}")
            if meta["constraints"]:
                extras.append(f"<br/>Constraints: {len(meta['constraints'])}")

            label = f"{meta['type']}: {name}" + "".join(extras)
            out.append(f'    {name}["{label}"]')
        # The embedded newline leaves a blank line after each subgraph.
        out.append("  end\n")

    for name, meta in stages.items():
        for dataset, dataset_name in meta["inputs"]:
            parent = producers.get(dataset)
            if not parent:
                continue  # external input: nothing to draw an edge from
            out.append(
                f'  {parent} -- "{dataset}: {dataset_name}" --> {name}'
            )

    return "\n".join(out)


# ==========================================================
# 4️⃣ EXPORT TO PDF-READY HTML
# ==========================================================

# Read the pseudocode explicitly as UTF-8: the parser matches non-ASCII
# marker characters, so the platform default encoding must not decide how
# the file is decoded.
with open("Sample_Job1 1 2_detailed_pseudocode.txt", "r", encoding="utf-8") as f:
    pseudo_text = f.read()

stages, producers = parse_pseudocode(pseudo_text)
mermaid_code = generate_mermaid(stages, producers)

# The page declares the same charset it is written with, so stage and
# dataset names containing non-ASCII characters render correctly.
html_template = f"""
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
  <script>mermaid.initialize({{ startOnLoad: true }});</script>
</head>
<body>
<div class="mermaid">
{mermaid_code}
</div>
</body>
</html>
"""

with open("etl_flow.html", "w", encoding="utf-8") as f:
    f.write(html_template)

print("HTML generated ‚Üí etl_flow.html")


‚úÖ HTML generated ‚Üí etl_flow.html
üëâ Open in browser ‚Üí Print ‚Üí Save as PDF
