In [None]:
%pip install -r requirements.txt

In [None]:
import os
from openai import OpenAI
from typing import List, Dict, Optional
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from collections import Counter

In [None]:
# Example AWS console action traces: a flat list of sequences, each sequence an ordered
# list of step descriptions. Every element is a list of strings; the second half of the
# traces used to sit inside one extra nested list, which made the 11th element a list of
# lists instead of a trace.
data = [
  [
    "Click new tab",
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Go to Services",
    "Select EC2",
    "Click Launch Instance",
    "Select AMI",
    "Choose Instance Type",
    "Click Review and Launch",
    "Click Launch"
  ],
  [
    "Open AWS Management Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select IAM",
    "Click on Users",
    "Choose Add User",
    "Enter User Details",
    "Attach Policies",
    "Click Create User"
  ],
  [
    "Click new tab",
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Select Services",
    "Navigate to RDS",
    "Click Create Database",
    "Choose Database Engine",
    "Configure Settings",
    "Click Create"
  ],
  [
    "Open AWS Console",
    "Enter Login Credentials",
    "Select Services",
    "Choose Lambda",
    "Click Create Function",
    "Enter Function Name",
    "Choose Runtime",
    "Click Create",
    "Upload Code"
  ],
  [
    "Go to AWS Console",
    "Enter username",
    "Enter password",
    "Select Services",
    "Go to CloudFormation",
    "Click Create Stack",
    "Upload Template File",
    "Click Next",
    "Configure Stack Settings",
    "Click Create Stack"
  ],
  [
    "Open AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to S3",
    "Click Create Bucket",
    "Enter Bucket Name",
    "Choose Region",
    "Click Create"
  ],
  [
    "Open AWS Console",
    "Enter username",
    "Enter password",
    "Go to Services",
    "Select CloudWatch",
    "Click Create Alarm",
    "Choose Metric",
    "Set Conditions",
    "Click Next",
    "Configure Notifications",
    "Click Create Alarm"
  ],
  [
    "Go to AWS Console",
    "Enter Login Information",
    "Select Services",
    "Navigate to VPC",
    "Click Create VPC",
    "Enter VPC Details",
    "Click Create VPC",
    "Go to Subnets",
    "Click Create Subnet",
    "Enter Subnet Details",
    "Click Create Subnet"
  ],
  [
    "Click new tab",
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Select Services",
    "Navigate to Elastic Beanstalk",
    "Click Create New Application",
    "Enter Application Name",
    "Choose Platform",
    "Click Create"
  ],
  [
    "Open AWS Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Select Services",
    "Go to Route 53",
    "Click Create Hosted Zone",
    "Enter Domain Name",
    "Configure Settings",
    "Click Create"
  ],
  [
    "Open AWS Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select S3",
    "Choose a Bucket",
    "Click Permissions Tab",
    "Edit Bucket Policy",
    "Save Changes"
  ],
  [
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select DynamoDB",
    "Click Create Table",
    "Enter Table Name",
    "Specify Primary Key",
    "Click Create Table"
  ],
  [
    "Open AWS Console",
    "Enter Login Credentials",
    "Select Services",
    "Go to SNS",
    "Click Create Topic",
    "Enter Topic Name",
    "Choose Type",
    "Click Create Topic",
    "Copy ARN"
  ],
  [
    "Click new tab",
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Select Services",
    "Navigate to Elastic Load Balancing",
    "Click Create Load Balancer",
    "Select Load Balancer Type",
    "Configure Settings",
    "Click Create"
  ],
  [
    "Open AWS Console",
    "Enter Login Information",
    "Navigate to Services",
    "Select ECS",
    "Click Create Cluster",
    "Select Cluster Template",
    "Configure Cluster Settings",
    "Click Create Cluster"
  ],
  [
    "Go to AWS Management Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Select Services",
    "Choose CodePipeline",
    "Click Create Pipeline",
    "Enter Pipeline Name",
    "Configure Stages",
    "Click Create Pipeline"
  ],
  [
    "Open AWS Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select AWS Backup",
    "Click Create Backup Plan",
    "Enter Plan Name",
    "Define Backup Rules",
    "Click Create Plan"
  ],
  [
    "Open AWS Console",
    "Enter Login Information",
    "Select Services",
    "Choose Systems Manager",
    "Click Automation",
    "Create New Automation",
    "Select Document Type",
    "Configure Execution Parameters",
    "Click Execute"
  ],
  [
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select ElasticSearch",
    "Click Create Domain",
    "Enter Domain Name",
    "Configure Settings",
    "Click Create"
  ],
  [
    "Open AWS Console",
    "Enter Login Credentials",
    "Select Services",
    "Navigate to Step Functions",
    "Click Create State Machine",
    "Enter State Machine Name",
    "Define State Machine Workflow",
    "Click Create"
  ]
]

In [None]:
# Module-level sentence-embedding model; generate_embedding() calls its encode() method,
# so this cell must run before any embedding is requested.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def _parse_correctness(response_text: str) -> bool:
    """Extract the True/False verdict from a model reply.

    The verdict is read from the ``Correctness:`` line only, so the word
    "true" appearing inside the free-text reason cannot flip the result.
    If no such line exists, only the first word of the reply is considered.
    """
    for line in response_text.splitlines():
        label, separator, value = line.partition(":")
        if separator and label.strip().lower() == "correctness":
            return value.strip(" \t[]*\"'").lower().startswith("true")
    # No structured verdict line: accept a bare leading "True" / "Yes" answer.
    first_words = response_text.strip().lstrip("[*\"' ").lower()
    return first_words.startswith(("true", "yes"))


def fuzzy_match(
    str1: str,
    str2: str,
):
    """
    Ask an OpenAI chat model whether two action-trace steps describe the same action.

    The API key is read from the ``OPENAI_API_KEY`` environment variable and the
    raw model reply is printed before returning.

    Args:
        str1: Text of the step taken from the first trace.
        str2: Text of the step taken from the second trace.

    Returns:
        A ``(is_match, raw_response)`` tuple. ``is_match`` is True when the
        ``Correctness:`` line of the reply says True; ``raw_response`` is the
        reply text (an empty string when the model returned no content).
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    task = "You are trying to figure out whether two steps in two different action traces are the same."
    # Each part is put on its own line with an explicit separator; plain string
    # concatenation used to glue the step text directly onto the preceding word.
    user_prompt = (
        f"{task}\n"
        f"The first step is: {str1}\n"
        f"The second step is: {str2}\n"
        "Your response should be in the following format:\n\n"
        "Correctness: [True/False]\n"
        "Reason: [Reason why the two steps are or are not the same]\n\n"
        "Respond True only if both steps describe the same action."
    )
    messages = [
        {"role": "system", "content": task},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model="gpt-4", messages=messages, max_tokens=400, temperature=0.0, stream=False
    )
    # The message content is optional in the API response; normalise None to "".
    content = response.choices[0].message.content or ""
    print(content)
    return _parse_correctness(content), content

In [None]:
def generate_embedding(text: str) -> List[float]:
    """
    Encode ``text`` with the module-level ``embedding_model`` and return the
    result converted to a plain Python list via ``tolist()``.
    """
    return embedding_model.encode(text).tolist()

In [None]:
def extract_nodes_and_edges(sequences: List[List[str]], verbose: bool = True) -> List[Dict[str, any]]:
    """
    Build a directed step graph from a list of action sequences.

    Every distinct step text becomes one node. A universal ``START`` node points
    at the first step of every sequence and the last step of every sequence
    points at a universal ``END`` node. Consecutive steps are linked in order.

    The work is done in two passes: nodes are created first so that the ``END``
    node already has its id when the edges are added (referencing it before it
    had an id used to raise ``KeyError``).

    Args:
        sequences: Action traces, each an ordered list of step descriptions.
            A step whose text is exactly ``"START"`` or ``"END"`` is treated as
            the corresponding sentinel node.
        verbose: When True (default) progress messages are printed.

    Returns:
        A list of node dicts with keys ``node_id``, ``content`` and
        ``next_nodes``. ``START`` comes first (``node_id`` 0), ``END`` second,
        then the steps in order of first appearance. ``next_nodes`` lists
        successor ids without duplicates, in the order the edges were first seen.
    """
    log = print if verbose else (lambda *args, **kwargs: None)
    log("Extracting nodes and edges from sequences...")

    start_node = "START"
    end_node = "END"
    # next_nodes is a dict used as an insertion-ordered set, so the resulting
    # edge lists are the same on every run (a plain set has no stable order).
    nodes = {
        start_node: {"node_id": 0, "content": start_node, "next_nodes": {}},
        end_node: {"node_id": None, "content": end_node, "next_nodes": {}},
    }

    # Pass 1: one node per distinct step, numbered in order of first appearance.
    for seq in sequences:
        for step in seq:
            if step not in nodes:
                nodes[step] = {
                    "node_id": f"node_{len(nodes) + 1}",
                    "content": step,
                    "next_nodes": {},
                }
                log(f"Created node: {nodes[step]['node_id']} with content: '{step}'")
    # END is numbered after every step node so its id cannot collide with theirs.
    nodes[end_node]["node_id"] = f"node_{len(nodes) + 1}"

    def add_edge(source: Dict[str, any], target: Dict[str, any]) -> None:
        source["next_nodes"][target["node_id"]] = None
        log(f"Added edge from '{source['node_id']}' to '{target['node_id']}'")

    # Pass 2: link START -> first step, step -> following step, last step -> END.
    for seq in sequences:
        log(f"Processing sequence: {seq}")
        last_index = len(seq) - 1
        for i, step in enumerate(seq):
            if i == 0:
                add_edge(nodes[start_node], nodes[step])
            successor = nodes[seq[i + 1]] if i < last_index else nodes[end_node]
            add_edge(nodes[step], successor)

    return [{**node, "next_nodes": list(node["next_nodes"])} for node in nodes.values()]

In [None]:
def create_embeddings_with_metadata(nodes: List[Dict[str, any]]) -> List[Dict[str, any]]:
    """
    Attach an embedding of each node's content to the node's metadata.

    For every input node a new dict is produced holding the node's ``node_id``,
    the embedding returned by ``generate_embedding(content)``, its ``next_nodes``
    and its ``content``. A short description of each node is printed as it is
    processed.
    """
    print("\nGenerating embeddings for each node...")
    results = []
    for node in nodes:
        identifier, successors, text = node["node_id"], node["next_nodes"], node["content"]
        results.append(
            dict(
                node_id=identifier,
                embedding=generate_embedding(text),
                next_nodes=successors,
                content=text,
            )
        )
        print(
            f"Node ID: {identifier}",
            f"Content: '{text}'",
            f"Next Nodes: {successors}\n",
            sep="\n",
        )
    return results

In [None]:
def find_nearest_node(query_embedding: List[float], embedded_data: List[Dict[str, any]]) -> Dict[str, any]:
    """
    Return the node whose embedding has the smallest cosine distance to the query.

    Ties keep the order of ``embedded_data`` (the sort is stable). Indexing the
    first result raises ``IndexError`` when ``embedded_data`` is empty.
    """
    scored = sorted(
        ((cosine(query_embedding, candidate['embedding']), candidate) for candidate in embedded_data),
        key=lambda pair: pair[0],
    )
    _, closest = scored[0]
    print(f"\nNearest node to the query is Node ID: {closest['node_id']} with content: '{closest['content']}'")
    return closest

In [None]:
def get_k_nearest_neighbors(current_node: Dict[str, any], embedded_data: List[Dict[str, any]], k: int) -> List[Dict[str, any]]:
    """
    Return the ``k`` nodes of ``embedded_data`` closest to ``current_node``.

    Closeness is the cosine distance between embeddings, smallest first. No
    node is excluded from the ranking, so ``current_node`` itself is ranked too
    when it is part of ``embedded_data``. The selected neighbours are printed.
    """
    anchor = current_node['embedding']
    ranked = sorted(embedded_data, key=lambda item: cosine(anchor, item['embedding']))
    nearest = ranked[:k]
    print(f"\nK={k} nearest neighbors to Node ID: {current_node['node_id']}")
    for entry in nearest:
        print(f"- Node ID: {entry['node_id']} | Content: '{entry['content']}'")
    return nearest

In [None]:
def select_most_popular_next_node(k_neighbors: List[Dict[str, any]], embedded_data: List[Dict[str, any]]) -> Optional[Dict[str, any]]:
    """
    Pick the successor node that the given neighbours point to most often.

    Each neighbour's ``next_nodes`` ids are tallied; ids that do not belong to
    any node in ``embedded_data`` are ignored. The candidates and their counts
    are printed.

    Args:
        k_neighbors: Nodes whose outgoing edges are counted.
        embedded_data: All known nodes, used to resolve ids to node dicts.

    Returns:
        The node with the highest count (on a tie, the one encountered first),
        or ``None`` when no neighbour has a resolvable outgoing edge.
    """
    print("\nCollecting outbound next nodes from k nearest neighbors...")

    # Resolve ids through a dict built once instead of rescanning embedded_data
    # for every edge. setdefault keeps the first node for a duplicated id, which
    # is what a front-to-back scan would return.
    nodes_by_id = {}
    for item in embedded_data:
        nodes_by_id.setdefault(item['node_id'], item)

    next_nodes_counts = Counter(
        next_node_id
        for neighbor in k_neighbors
        for next_node_id in neighbor['next_nodes']
        if next_node_id in nodes_by_id
    )

    if not next_nodes_counts:
        print("No next nodes found from k nearest neighbors.")
        return None

    print("Next nodes and their frequencies:")
    for node_id, count in next_nodes_counts.items():
        node_content = nodes_by_id[node_id]['content']
        print(f"- Node ID: {node_id} | Content: '{node_content}' | Count: {count}")

    # most_common() orders equal counts by first encounter, so the head of the
    # list is the first-seen node among those tied for the highest count.
    selected_node_id, _ = next_nodes_counts.most_common(1)[0]
    selected_node = nodes_by_id[selected_node_id]
    print(f"Selected next node: Node ID: {selected_node_id} | Content: '{selected_node['content']}'")
    return selected_node

In [None]:
def generate_full_path_sasank(query_content: str, embedded_data: List[Dict[str, any]], k: int) -> List[str]:
    """
    Walk the step graph from the node nearest to the query and return the visited node ids.

    The walk starts at ``find_nearest_node`` of the query embedding. At each
    step the ``k`` nearest neighbours of the current node vote for a successor
    via ``select_most_popular_next_node``. The walk ends when there is no
    successor or the successor has already been visited.
    """
    print(f"\nGenerating embedding for query: '{query_content}'")
    query_embedding = generate_embedding(query_content)

    # The walk begins at the node most similar to the query text.
    node = find_nearest_node(query_embedding, embedded_data)
    path = [node['node_id']]
    seen = {node['node_id']}

    while True:
        neighbours = get_k_nearest_neighbors(node, embedded_data, k)
        candidate = select_most_popular_next_node(neighbours, embedded_data)

        # Stop on a dead end or as soon as the walk would revisit a node.
        if (not candidate) or candidate['node_id'] in seen:
            shown_id = candidate['node_id'] if candidate else None
            print(f"\nStopping path generation. Next node: '{shown_id}' | Visited: {seen}")
            return path

        candidate_id = candidate['node_id']
        path.append(candidate_id)
        seen.add(candidate_id)
        node = candidate

In [None]:
def pick_next_node(current_node: str, next_nodes: List[dict[str, any]], query_content:str):
    """
    Ask an OpenAI chat model which candidate step should follow the current one.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Args:
        current_node: Text of the step the walk is currently at.
        next_nodes: Candidate successors. Dict candidates are listed in the
            prompt as ``<node_id>: <content>``; any other value (for example a
            bare node id) is listed as-is.
        query_content: The user's goal, used by the model to choose.

    Returns:
        The raw text of the model's reply. The prompt asks the model to put the
        chosen node id on the last line.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Render the candidates as text: the list itself cannot be concatenated
    # onto the prompt string.
    option_lines = []
    for option in next_nodes:
        if isinstance(option, dict):
            option_lines.append(f"- {option.get('node_id')}: {option.get('content')}")
        else:
            option_lines.append(f"- {option}")

    user_prompt = "\n".join(
        [
            "You are trying to figure out the next step in a sequence of actions based on the query.",
            f"The current step is: {current_node}",
            "The possible next steps are:",
            *option_lines,
            f"The query is: {query_content}",
            # Node ids are longer than one character, so ask for a whole line.
            "Pick the next step based on the current context. "
            "Return the id of the next step on the last line of your response.",
        ]
    )

    messages = [
        {"role": "system", "content": "You are trying to figure out the next step in a sequence of actions. Respond appropriately based on the current context."},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model="gpt-4", messages=messages, max_tokens=400, temperature=0.0, stream=False
    )
    return response.choices[0].message.content

In [None]:
def _match_candidate(response, candidates):
    """Return the candidate whose node id is mentioned last in ``response``, or None."""
    by_id = {str(candidate['node_id']): candidate for candidate in candidates}
    # Strip surrounding punctuation so replies such as "node_4." or "'node_4'" still match.
    tokens = [token.strip(".,;:!?\"'`()[]{}*") for token in (response or "").split()]
    for token in reversed(tokens):
        if token in by_id:
            return by_id[token]
    return None


def generate_full_path(query_content: str, embedded_data: List[Dict[str, any]], current_node: dict, k: int):
    """
    Walk the step graph from ``current_node`` until the ``END`` node, choosing
    each successor with ``pick_next_node``.

    At every node the ids in ``next_nodes`` are resolved against
    ``embedded_data``. A single resolvable successor is followed directly. With
    several, ``pick_next_node`` is asked to choose and the last candidate id
    mentioned in its reply is followed.

    Args:
        query_content: The user's goal, forwarded to ``pick_next_node``.
        embedded_data: All known nodes (dicts with ``node_id``, ``content`` and
            ``next_nodes``).
        current_node: The node to start from.
        k: Unused by this function; kept so the signature stays unchanged.

    Returns:
        The ids of the visited nodes in order, starting with ``current_node``
        and including the ``END`` node when it is reached. The walk stops early
        on a dead end, an unrecognisable reply, or a node that was already
        visited.
    """
    print(f"\nGenerating path for query: '{query_content}'")

    nodes_by_id = {}
    for item in embedded_data:
        nodes_by_id.setdefault(item['node_id'], item)

    path = [current_node['node_id']]
    visited = set(path)

    while current_node['content'] != "END":
        candidates = [
            nodes_by_id[node_id]
            for node_id in current_node['next_nodes']
            if node_id in nodes_by_id
        ]
        if not candidates:
            print(f"Stopping: node '{current_node['node_id']}' has no known next nodes.")
            break

        if len(candidates) == 1:
            # Nothing to choose between, so no model call is needed.
            next_node = candidates[0]
        else:
            # Pass only id and content: the embedding vectors are not useful in a prompt.
            options = [
                {"node_id": candidate['node_id'], "content": candidate['content']}
                for candidate in candidates
            ]
            response = pick_next_node(current_node['content'], options, query_content)
            next_node = _match_candidate(response, candidates)
            if next_node is None:
                print(f"Stopping: no candidate id found in the model response: {response!r}")
                break

        if next_node['node_id'] in visited:
            # The graph can contain cycles; refuse to loop forever.
            print(f"Stopping: node '{next_node['node_id']}' was already visited.")
            break

        path.append(next_node['node_id'])
        visited.add(next_node['node_id'])
        current_node = next_node

    return path

    




In [None]:
def node_id_to_content(node_id: str, embedded_data: List[Dict[str, any]]) -> str:
    """
    Look up the ``content`` of the first node in ``embedded_data`` whose
    ``node_id`` equals ``node_id``; return an empty string when none matches.
    """
    for candidate in embedded_data:
        if candidate['node_id'] == node_id:
            return candidate['content']
    return ""

In [None]:
# Four example action traces used to build the step graph below. The first three
# share their login steps with the fourth, which is the S3 bucket-creation trace.
sequences = [
[
    "Open AWS Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select AWS Backup",
    "Click Create Backup Plan",
    "Enter Plan Name",
    "Define Backup Rules",
    "Click Create Plan"
],
[
    "Open AWS Console",
    "Enter Login Information",
    "Select Services",
    "Choose Systems Manager",
    "Click Automation",
    "Create New Automation",
    "Select Document Type",
    "Configure Execution Parameters",
    "Click Execute"
],
[
    "Go to AWS Login Page",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select ElasticSearch",
    "Click Create Domain",
    "Enter Domain Name",
    "Configure Settings",
    "Click Create"
],
[
    "Open AWS Console",
    "Enter username",
    "Enter password",
    "Click Login Button",
    "Navigate to Services",
    "Select S3",
    "Click Create Bucket",
    "Enter Bucket Name",
    "Configure Bucket Settings",
    "Click Create Bucket"
],
]
# Turn the traces into graph nodes (one per distinct step text) and print the result.
nodes = extract_nodes_and_edges(sequences)
print(nodes)

In [None]:
# Embed every node; each record keeps its id, content and outgoing edges next to the vector.
embedded_data = create_embeddings_with_metadata(nodes)

# Print a short summary rather than the records themselves: every record carries a
# full embedding vector, and create_embeddings_with_metadata() has already printed
# each node's id, content and next nodes.
embedding_dim = len(embedded_data[0]["embedding"]) if embedded_data else 0
print(f"Embedded {len(embedded_data)} nodes (embedding dimension: {embedding_dim})")

In [None]:
query_content = "Create AWS bucket"
k = 3  # Number of nearest neighbors to consider

# Generate full path based on the query.
# generate_full_path() expects a starting node as its third argument, so calling it
# with (query, data, k) fails. generate_full_path_sasank() takes exactly
# (query, data, k) and returns the list of visited node ids.
path_node_ids = generate_full_path_sasank(query_content, embedded_data, k)

# Convert node IDs back to content for readability
path_contents = [node_id_to_content(node_id, embedded_data) for node_id in path_node_ids]

print("\nGenerated Path:")
for step in path_contents:
    print(f"- {step}")