In [None]:
pip install openai==0.28

In [None]:
import os
import json
import warnings
from lxml import etree
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import openai

# Silence all Python warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Mount Google Drive (inputs are read from it and outputs are written back to it)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set OpenAI API key.
# The OPENAI_API_KEY environment variable takes precedence so the secret does
# not have to be stored in the notebook; the placeholder below is only a
# fallback and can still be replaced by hand.
openai.api_key = os.environ.get("OPENAI_API_KEY", "openai-api-key")  # Or replace the fallback with your OpenAI API key

# Paths to files on Google Drive
base_dir = "/content/drive/My Drive/SpecFiles/"  # Replace with your directory path
# BPMN models to analyse; each one is compared against the reference annotations.
bpmn_file_paths = [
    os.path.join(base_dir, "Onboarding_MGMT_End_Start_Error_with_Adjusted_Layout.bpmn"),
    os.path.join(base_dir, "Onboarding_MGMT_Incorrect_Link_with_Diagram_Resolved.bpmn"),
    os.path.join(base_dir, "Onboarding_MGMT_Logical_Error_with_Diagram.bpmn"),
    os.path.join(base_dir, "Onboarding_MGMT_Task_Error_with_Diagram.bpmn"),
    os.path.join(base_dir, "Onboarding_MGMT_Task_Error.bpmn"),
]
# Reference annotations (JSON) that the BPMN entities are checked against.
json_file_path = os.path.join(base_dir, "onboarding_mgmt_bpmn_annotations.json")

# Output paths
report_file_path = os.path.join(base_dir, "fault_detection_report.txt")
solutions_file_path = os.path.join(base_dir, "fault_solutions.txt")
corrected_bpmn_dir = os.path.join(base_dir, "corrected_bpmn/")
os.makedirs(corrected_bpmn_dir, exist_ok=True)

# Load embedding model (used to embed entity names before clustering)
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Helper: Map entity types to valid BPMN element types
def map_entity_type(entity_type):
    """Translate an entity type label into a BPMN element tag name.

    Known labels are matched ignoring case and whitespace, so
    ``'Start Event'``, ``'startEvent'`` and ``'startevent'`` all yield
    ``'startEvent'``. This makes the function idempotent: feeding its own
    output back in returns the same tag instead of a lowercased, invalid one.

    Unknown labels are converted to lowerCamelCase (``'Service Task'`` ->
    ``'serviceTask'``), which is the naming scheme BPMN element tags use.

    Args:
        entity_type: Type label as a string (may be empty).

    Returns:
        The BPMN tag name, or ``''`` when the label is empty or blank.
    """
    type_mapping = {
        'Start Event': 'startEvent',
        'End Event': 'endEvent',
        'Task': 'task',
        'User Task': 'userTask',
        'Exclusive Gateway': 'exclusiveGateway',
        # Add other mappings as needed
    }

    def _normalize(label):
        # Drop all whitespace and case so differently written labels compare equal.
        return "".join(label.split()).lower()

    canonical = {_normalize(label): tag for label, tag in type_mapping.items()}
    known_tag = canonical.get(_normalize(entity_type))
    if known_tag is not None:
        return known_tag

    words = entity_type.split()
    if not words:
        return ""
    # lowerCamelCase fallback: keep the inner casing of each word so an input
    # that is already camelCase (e.g. 'serviceTask') passes through unchanged.
    head, *tail = words
    return head[0].lower() + head[1:] + "".join(
        word[0].upper() + word[1:] for word in tail
    )

# Helper: Extract BPMN entities from files
def extract_entities_from_bpmn(bpmn_file_paths):
    """Collect the named flow nodes of each BPMN file.

    Every file is parsed and scanned for ``task``, ``userTask``,
    ``startEvent``, ``endEvent`` and ``exclusiveGateway`` elements (matched by
    local tag name). Elements without a non-empty ``name`` attribute are
    skipped. A file that fails to parse is reported on stdout and left out of
    the result.

    Args:
        bpmn_file_paths: Iterable of paths to BPMN files.

    Returns:
        Dict mapping each file's base name to a list of
        ``{"id", "name", "type"}`` dicts.
    """
    wanted_tags = ("task", "userTask", "startEvent", "endEvent", "exclusiveGateway")
    namespaces = {"bpmn": "http://www.omg.org/spec/BPMN/20100524/MODEL"}
    entities_by_file = {}

    for path in bpmn_file_paths:
        try:
            print(f"Parsing BPMN file: {path}")
            document_root = etree.parse(path).getroot()
            named_nodes = []
            for node in document_root.xpath(".//*", namespaces=namespaces):
                local_tag = etree.QName(node).localname
                if local_tag not in wanted_tags:
                    continue
                label = node.get("name")
                if not label:
                    # Unnamed elements cannot be embedded, so they are ignored.
                    continue
                named_nodes.append(
                    dict(id=node.get("id"), name=label, type=local_tag)
                )
            entities_by_file[os.path.basename(path)] = named_nodes
            print(f"Extracted {len(named_nodes)} entities from {path}")
        except Exception as error:
            # Best effort: one broken file must not stop the remaining files.
            print(f"Error parsing BPMN file {path}: {error}")

    return entities_by_file

# Helper: Generate embeddings for entities
def generate_entity_embeddings(entities):
    """Attach a sentence embedding to every entity that has a name.

    Entities lacking a ``"name"`` key, or whose name is empty, are dropped.
    The name is encoded with the module-level ``embedding_model``.

    Args:
        entities: Iterable of dicts with ``"id"``, ``"name"`` and ``"type"``.

    Returns:
        List of new dicts with keys ``"id"``, ``"name"``, ``"type"`` and
        ``"embedding"``; the input dicts are not modified.
    """
    embedded = []
    for item in entities:
        if "name" not in item or not item["name"]:
            continue
        vector = embedding_model.encode(item["name"])
        embedded.append(
            dict(
                id=item["id"],
                name=item["name"],
                type=item["type"],
                embedding=vector,
            )
        )
    return embedded

# Helper: Detect faults using clustering
def detect_faults(reference_entities, bpmn_entities):
    """Find BPMN entities with no reference counterpart, and vice versa.

    Reference and BPMN entities are clustered together on their embeddings
    (agglomerative clustering, distance threshold 1.0). A cluster holding only
    BPMN entities is reported as ``"redundant"``; a cluster holding only
    reference entities is reported as ``"missing"``. Mixed clusters are
    treated as matched and produce no fault.

    Args:
        reference_entities: Entity dicts with an ``"embedding"`` entry, taken
            from the reference annotations.
        bpmn_entities: Entity dicts with an ``"embedding"`` entry, taken from
            the BPMN file.

    Returns:
        List of fault dicts ``{"type", "cluster", "entities"}``. Each entity
        in ``"entities"`` is a copy of the input dict extended with
        ``"source"`` (``"reference"`` or ``"bpmn"``) and ``"cluster"``.
    """
    embeddings = [
        {"source": "reference", **entity} for entity in reference_entities
    ] + [
        {"source": "bpmn", **entity} for entity in bpmn_entities
    ]
    if not embeddings:
        return []

    if len(embeddings) == 1:
        # AgglomerativeClustering refuses to fit fewer than two samples; a
        # lone entity trivially forms its own cluster.
        labels = [0]
    else:
        embedding_vectors = np.array([e["embedding"] for e in embeddings])
        labels = AgglomerativeClustering(
            n_clusters=None, distance_threshold=1.0
        ).fit(embedding_vectors).labels_

    clusters = defaultdict(list)
    for entry, label in zip(embeddings, labels):
        # Store plain ints rather than NumPy scalars so faults stay easy to
        # print, compare and serialize.
        entry["cluster"] = int(label)
        clusters[entry["cluster"]].append(entry)

    faults = []
    for cluster_id, members in clusters.items():
        reference_members = [m for m in members if m["source"] == "reference"]
        bpmn_members = [m for m in members if m["source"] == "bpmn"]
        if bpmn_members and not reference_members:
            faults.append({"type": "redundant", "cluster": cluster_id, "entities": bpmn_members})
        elif reference_members and not bpmn_members:
            faults.append({"type": "missing", "cluster": cluster_id, "entities": reference_members})
    return faults

# Helper: Generate solutions for faults using OpenAI
def generate_solutions_for_faults(faults):
    """Ask the OpenAI chat API for a textual fix for every detected fault.

    One request is sent per fault (model ``gpt-4``, at most 300 tokens,
    temperature 0.7). A fault whose request fails is reported on stdout and
    left out of the result.

    Args:
        faults: Fault dicts with ``"type"`` and ``"entities"`` (each entity
            having a ``"name"``).

    Returns:
        List of ``{"fault": <fault dict>, "solution": <str>}`` entries, in the
        order the faults were processed.
    """
    fault_labels = {"redundant": "redundant steps"}
    collected = []

    for fault in faults:
        # Anything that is not "redundant" is described as missing steps.
        fault_type = fault_labels.get(fault["type"], "missing steps")
        entities_text = "\n".join(member["name"] for member in fault["entities"])
        prompt = f"""
You are an expert in BPMN. Analyze the following fault and suggest corrections:
Fault Type: {fault_type}
Entities Involved:
{entities_text}
Provide detailed steps to correct the issue.
"""
        conversation = [
            {"role": "system", "content": "You are a BPMN expert."},
            {"role": "user", "content": prompt},
        ]
        try:
            reply = openai.ChatCompletion.create(
                model="gpt-4",
                messages=conversation,
                max_tokens=300,
                temperature=0.7,
            )
            answer = reply.choices[0].message.content.strip()
            collected.append({"fault": fault, "solution": answer})
            print(f"Generated solution for fault type '{fault_type}'")
        except Exception as error:
            # Best effort: a failed request only costs this fault its solution.
            print(f"Error generating solution for fault {fault}: {error}")

    return collected

# Helper: Apply corrections to BPMN files
# Tag names that are already valid BPMN element names. Entity types arriving
# in this form are used as-is instead of being passed through the type mapper
# a second time.
_KNOWN_BPMN_TAGS = frozenset(
    {"startEvent", "endEvent", "task", "userTask", "exclusiveGateway"}
)


def _bpmn_tag(ns, prefix, local_name):
    """Return the ``{namespace-uri}local`` tag for a prefixed element name."""
    return f"{{{ns[prefix]}}}{local_name}"


def _append_shape(plane, ns, element_id, x, y, width, height):
    """Create a BPMNShape with Bounds for ``element_id`` and add it to the plane."""
    shape = etree.Element(_bpmn_tag(ns, "bpmndi", "BPMNShape"))
    shape.set("id", f"{element_id}_di")
    shape.set("bpmnElement", element_id)
    bounds = etree.SubElement(shape, _bpmn_tag(ns, "dc", "Bounds"))
    bounds.set("x", str(x))
    bounds.set("y", str(y))
    bounds.set("width", str(width))
    bounds.set("height", str(height))
    plane.append(shape)
    return shape


def _shape_center(shape, ns):
    """Return the ``(x, y)`` centre of a shape's Bounds, or None if unavailable."""
    if shape is None:
        return None
    bounds = shape.find(".//dc:Bounds", namespaces=ns)
    if bounds is None:
        return None
    center_x = float(bounds.get("x")) + float(bounds.get("width")) / 2
    center_y = float(bounds.get("y")) + float(bounds.get("height")) / 2
    return center_x, center_y


def _add_sequence_flow(process, plane, ns, source_id, target_id,
                       shape_by_bpmnElement, edge_by_bpmnElement):
    """Add a sequenceFlow from source to target, plus its BPMNEdge when drawable."""
    flow_id = f"Flow_{source_id}_{target_id}"
    new_flow = etree.Element(_bpmn_tag(ns, "bpmn", "sequenceFlow"))
    new_flow.set("id", flow_id)
    new_flow.set("sourceRef", source_id)
    new_flow.set("targetRef", target_id)
    process.append(new_flow)
    print(f"Added sequenceFlow: {flow_id}")

    # The edge is only drawn when both endpoints have a shape with bounds;
    # otherwise there are no coordinates to derive waypoints from.
    start_point = _shape_center(shape_by_bpmnElement.get(source_id), ns)
    end_point = _shape_center(shape_by_bpmnElement.get(target_id), ns)
    if start_point is None or end_point is None:
        return

    bpmn_edge = etree.Element(_bpmn_tag(ns, "bpmndi", "BPMNEdge"))
    bpmn_edge.set("id", f"{flow_id}_di")
    bpmn_edge.set("bpmnElement", flow_id)
    for point_x, point_y in (start_point, end_point):
        waypoint = etree.SubElement(bpmn_edge, _bpmn_tag(ns, "di", "waypoint"))
        waypoint.set("x", str(point_x))
        waypoint.set("y", str(point_y))
    plane.append(bpmn_edge)
    # Register the edge itself (not the flow) so that a later removal of the
    # flow also finds and removes its diagram edge.
    edge_by_bpmnElement[flow_id] = bpmn_edge
    print(f"Added BPMNEdge with waypoints: {bpmn_edge.get('id')}")


def _remove_redundant_element(root, ns, entity_id, element_by_id,
                              shape_by_bpmnElement, edge_by_bpmnElement):
    """Remove an element together with its sequence flows, edges and shape."""
    element = element_by_id.get(entity_id)
    if element is None:
        return

    incoming_flows = root.xpath(
        f".//bpmn:sequenceFlow[@targetRef='{entity_id}']", namespaces=ns
    )
    outgoing_flows = root.xpath(
        f".//bpmn:sequenceFlow[@sourceRef='{entity_id}']", namespaces=ns
    )
    all_flows = incoming_flows + outgoing_flows

    # Remove edges
    for flow in all_flows:
        edge = edge_by_bpmnElement.pop(flow.get("id"), None)
        if edge is not None and edge.getparent() is not None:
            edge.getparent().remove(edge)
            print(f"Removed BPMNEdge: {edge.get('id')}")

    # Remove flows (a self-loop appears twice; the parent check skips the repeat)
    for flow in all_flows:
        if flow.getparent() is not None:
            flow.getparent().remove(flow)
            element_by_id.pop(flow.get("id"), None)
            print(f"Removed sequenceFlow: {flow.get('id')}")

    # Remove the element
    if element.getparent() is not None:
        element.getparent().remove(element)
        del element_by_id[entity_id]
        print(f"Removed element: {entity_id}")

    # Remove BPMNShape, and forget it so later additions neither connect from
    # the deleted node nor count it when positioning new shapes.
    shape = shape_by_bpmnElement.pop(entity_id, None)
    if shape is not None and shape.getparent() is not None:
        shape.getparent().remove(shape)
        print(f"Removed BPMNShape: {shape.get('id')}")


def _add_missing_element(process, plane, ns, entity, element_by_id,
                         shape_by_bpmnElement, edge_by_bpmnElement):
    """Add a missing element, its shape, and one connecting sequence flow."""
    entity_id = entity["id"]
    if entity_id in element_by_id:
        print(f"Element {entity['id']} already exists. Skipping.")
        return

    # Types that are already BPMN tag names are kept verbatim; anything else
    # is translated by the shared mapper.
    raw_type = entity['type']
    element_type = raw_type if raw_type in _KNOWN_BPMN_TAGS else map_entity_type(raw_type)
    try:
        new_element = etree.Element(_bpmn_tag(ns, "bpmn", element_type))
    except ValueError as ve:
        print(f"Invalid BPMN tag for type '{element_type}': {ve}")
        return
    new_element.set("id", entity_id)
    new_element.set("name", entity["name"])
    process.append(new_element)
    element_by_id[entity_id] = new_element
    print(f"Added new element: {entity['id']} ({element_type})")

    # Place the shape to the right, offset by the number of known shapes,
    # on a fixed horizontal line (y = 200).
    x_coord = 100 + 150 * len(shape_by_bpmnElement)
    if element_type in ('startEvent', 'endEvent'):
        width, height = 36, 36
    else:
        width, height = 100, 80
    bpmn_shape = _append_shape(plane, ns, entity_id, x_coord, 200, width, height)
    shape_by_bpmnElement[entity_id] = bpmn_shape
    print(f"Added BPMNShape: {bpmn_shape.get('id')}")

    if element_type == 'startEvent':
        # Start events have no incoming flows: connect to the first task, if any.
        tasks = process.xpath(".//bpmn:task | .//bpmn:userTask", namespaces=ns)
        if tasks:
            _add_sequence_flow(process, plane, ns, entity_id, tasks[0].get("id"),
                               shape_by_bpmnElement, edge_by_bpmnElement)
        return

    if element_type == 'endEvent':
        # End events have no outgoing flows: connect from the last task, if any.
        tasks = process.xpath(".//bpmn:task | .//bpmn:userTask", namespaces=ns)
        if tasks:
            _add_sequence_flow(process, plane, ns, tasks[-1].get("id"), entity_id,
                               shape_by_bpmnElement, edge_by_bpmnElement)
        return

    # Tasks and other elements: connect from the most recently registered
    # shape other than the new one.
    # NOTE(review): that shape may belong to an end event or a non-flow
    # element; confirm whether a stricter choice of source is wanted.
    previous_elements = [
        elem_id for elem_id in shape_by_bpmnElement if elem_id != entity_id
    ]
    if previous_elements:
        source_id = previous_elements[-1]
    else:
        # Nothing to connect from: make sure a start event exists and use it.
        source_id = "StartEvent_1"
        if source_id not in element_by_id:
            start_event = etree.Element(_bpmn_tag(ns, "bpmn", "startEvent"))
            start_event.set("id", source_id)
            process.insert(0, start_event)
            element_by_id[source_id] = start_event
            shape_by_bpmnElement[source_id] = _append_shape(
                plane, ns, source_id, 50, 200, 36, 36
            )
            print(f"Added startEvent: {source_id}")

    _add_sequence_flow(process, plane, ns, source_id, entity_id,
                       shape_by_bpmnElement, edge_by_bpmnElement)


def apply_corrections_to_bpmn(file_name, entities, faults, base_dir, output_dir):
    """Apply detected faults to a BPMN file and save the corrected copy.

    For ``"redundant"`` faults each listed element is removed together with
    its incoming/outgoing sequence flows, their BPMNEdges and its BPMNShape.
    For ``"missing"`` faults each listed element is added to the process with
    a BPMNShape and one sequence flow: start events connect to the first
    task, end events connect from the last task, and every other element
    connects from the most recently registered shape (a ``StartEvent_1`` is
    created when there is none). A BPMNDiagram/BPMNPlane is created if the
    file has no plane.

    Args:
        file_name: Name of the BPMN file, relative to ``base_dir``.
        entities: Entities extracted from the file. Currently unused; kept so
            existing callers keep working.
        faults: Fault dicts with ``"type"`` and ``"entities"`` (each entity
            having ``"id"``, ``"name"`` and ``"type"``).
        base_dir: Directory containing the input file.
        output_dir: Directory the corrected file is written to.

    Returns:
        Path of the written file, ``<output_dir>/corrected_<file_name>``.

    Raises:
        ValueError: If the file contains no ``bpmn:process`` element.
    """
    print(f"Applying corrections to BPMN file: {file_name}")
    tree = etree.parse(os.path.join(base_dir, file_name))
    root = tree.getroot()
    ns = {
        "bpmn": "http://www.omg.org/spec/BPMN/20100524/MODEL",
        "bpmndi": "http://www.omg.org/spec/BPMN/20100524/DI",
        "dc": "http://www.omg.org/spec/DD/20100524/DC",
        "di": "http://www.omg.org/spec/DD/20100524/DI",
    }

    # Lookup tables: element by id, and diagram shape/edge by referenced element id.
    element_by_id = {
        element.get("id"): element
        for element in root.xpath(".//*[@id]", namespaces=ns)
    }
    shape_by_bpmnElement = {
        shape.get("bpmnElement"): shape
        for shape in root.xpath(".//bpmndi:BPMNShape", namespaces=ns)
    }
    edge_by_bpmnElement = {
        edge.get("bpmnElement"): edge
        for edge in root.xpath(".//bpmndi:BPMNEdge", namespaces=ns)
    }

    process = root.find(".//bpmn:process", namespaces=ns)
    if process is None:
        raise ValueError(f"No process element found in {file_name}")

    # Make sure there is a diagram plane to attach new shapes and edges to.
    plane = root.find(".//bpmndi:BPMNPlane", namespaces=ns)
    if plane is None:
        diagram = root.find(".//bpmndi:BPMNDiagram", namespaces=ns)
        if diagram is None:
            diagram = etree.SubElement(root, _bpmn_tag(ns, "bpmndi", "BPMNDiagram"))
            diagram.set("id", "BPMNDiagram_1")
        plane = etree.SubElement(diagram, _bpmn_tag(ns, "bpmndi", "BPMNPlane"))
        plane.set("id", "BPMNPlane_1")
        plane.set("bpmnElement", process.get("id"))

    for fault in faults:
        if fault["type"] == "redundant":
            for entity in fault["entities"]:
                _remove_redundant_element(
                    root, ns, entity["id"], element_by_id,
                    shape_by_bpmnElement, edge_by_bpmnElement,
                )
        elif fault["type"] == "missing":
            for entity in fault["entities"]:
                _add_missing_element(
                    process, plane, ns, entity, element_by_id,
                    shape_by_bpmnElement, edge_by_bpmnElement,
                )

    corrected_path = os.path.join(output_dir, f"corrected_{file_name}")
    tree.write(corrected_path, pretty_print=True, xml_declaration=True, encoding="UTF-8")
    print(f"Saved corrected BPMN file: {corrected_path}")
    return corrected_path

# Main execution
def main():
    """Run the fault-detection pipeline over every configured BPMN file.

    Steps: load the reference annotations (JSON), extract entities from the
    BPMN files, and for each file embed both entity sets, detect faults,
    request solutions from OpenAI and write a corrected BPMN copy. Finally the
    fault report and the solutions are written as text files.

    Errors are reported on stdout: a failure to load the reference data ends
    the run early; a failure while correcting one file, or while writing one
    of the output files, does not stop the remaining steps.
    """
    # Load reference data
    try:
        with open(json_file_path, "r", encoding="utf-8") as json_file:
            reference_data = json.load(json_file)
        print(f"Loaded reference data from {json_file_path}")
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Error loading JSON file {json_file_path}: {e}")
        return

    # Extract entities from BPMN files
    bpmn_entities_dict = extract_entities_from_bpmn(bpmn_file_paths)

    # For each BPMN file, generate fault reports and corrections
    all_reports = []
    all_solutions = []
    for file_name, entities in bpmn_entities_dict.items():
        print(f"\nProcessing BPMN file: {file_name}")
        reference_entities = reference_data.get(file_name, [])
        # Map entity types in reference data (updates the loaded dicts in place)
        for entity in reference_entities:
            original_type = entity.get('type', '')
            mapped_type = map_entity_type(original_type)
            entity['type'] = mapped_type
            print(f"Mapped reference entity type: '{original_type}' to '{mapped_type}'")

        reference_embeddings = generate_entity_embeddings(reference_entities)
        bpmn_embeddings = generate_entity_embeddings(entities)
        faults = detect_faults(reference_embeddings, bpmn_embeddings)
        print(f"Detected {len(faults)} faults in {file_name}")
        solutions = generate_solutions_for_faults(faults)
        all_reports.append({"file_name": file_name, "faults": faults})
        all_solutions.append({"file_name": file_name, "solutions": solutions})

        # Apply corrections to the current BPMN file. A failure here must not
        # discard the faults and solutions already collected for the reports.
        try:
            corrected_file = apply_corrections_to_bpmn(
                file_name, entities, faults, base_dir, corrected_bpmn_dir
            )
        except (ValueError, OSError, etree.LxmlError) as e:
            print(f"Error applying corrections to {file_name}: {e}")
        else:
            print(f"Corrected BPMN file saved: {corrected_file}")

    # Save reports
    try:
        with open(report_file_path, "w", encoding="utf-8") as report_file:
            for report in all_reports:
                report_file.write(f"File: {report['file_name']}\n")
                for fault in report["faults"]:
                    report_file.write(f"  Fault Type: {fault['type']}\n")
                    report_file.write(f"  Entities: {', '.join([e['name'] for e in fault['entities']])}\n")
                report_file.write("\n")
        print(f"\nSaved fault detection report to {report_file_path}")
    except (OSError, UnicodeError) as e:
        print(f"Error writing report file {report_file_path}: {e}")

    # Save solutions
    try:
        with open(solutions_file_path, "w", encoding="utf-8") as solutions_file:
            for solution_data in all_solutions:
                solutions_file.write(f"File: {solution_data['file_name']}\n")
                for solution in solution_data["solutions"]:
                    fault_type = solution['fault']['type']
                    entities_involved = ', '.join([e['name'] for e in solution['fault']['entities']])
                    solutions_file.write(f"  Fault Type: {fault_type}\n")
                    solutions_file.write(f"  Entities Involved: {entities_involved}\n")
                    solutions_file.write(f"  Solution: {solution['solution']}\n\n")
        print(f"Saved fault solutions to {solutions_file_path}")
    except (OSError, UnicodeError) as e:
        print(f"Error writing solutions file {solutions_file_path}: {e}")

if __name__ == "__main__":
    main()
