# RAG development  using SBERT

### Model setup

#### Installing sbert transformers

In [1]:
%pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


#### Setup embeddings model

In [1]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
2024-10-07 09:41:55.046171: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-07 09:41:55.050515: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-07 09:41:55.146937: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## INDEXING

### LOAD

In [2]:
#@title Import Mind2Web dataset form huggingface
from datasets import load_dataset

# Text only dataset
ds = load_dataset("osunlp/Mind2Web")
# Multimodal dataset
#ds = load_dataset("osunlp/Multimodal-Mind2Web")

In [3]:
# Extract the train split from the dataset
train_ds = ds['train']

# Get an example task for testing
task = train_ds[1]
# Get the HTML from one of the task's actions for testing
html = task['actions'][0]['cleaned_html']
print(len(html))

42254


In [243]:
print(html)

<html backend_node_id="117">
  <body>
    <div backend_node_id="561">
      <div backend_node_id="562">
        <div backend_node_id="569">
          <div backend_node_id="570">
            <a backend_node_id="572">
                <text backend_node_id="573">Skip to main content</text>
              </a>
            <a backend_node_id="578">
                      <text backend_node_id="579">Use Tock at your business</text>
                    </a>
                  <header backend_node_id="584">
              <div backend_node_id="586">
                  <a backend_node_id="589" aria_label="Tock home page"/>
                  <div backend_node_id="597">
                    <button backend_node_id="598" aria_label="Search"/>
                    <button backend_node_id="602" aria_label="Menu"/>
                  </div>
                </div>
              <div backend_node_id="607">
                <div backend_node_id="608">
                  <div backend_node_id="610">
               

In [19]:
print('webpage: ', task['website'])
print('domain: ', task['domain'])
print('subdomain: ', task['subdomain'])

print('user_prompt: ', task['confirmed_task'])

webpage:  exploretock
domain:  Travel
subdomain:  Restaurant
user_prompt:  Book a winery tour in Napa Valley in a winery which serves Mediterranean cuisine with wine testing for 4 guests on April 15, 10 am in a outdoor setup.


## SPLITTING

In [178]:
from bs4 import BeautifulSoup
import pandas as pd

# Function to create a path with parent 'backend_node_id' attributes
def create_path(node, path=''):
    # Recursively build the path using 'backend_node_id' attributes
    if not node or not node.has_attr('backend_node_id'):
        return path
    current_node_id = node['backend_node_id']
    parent_path = create_path(node.parent, path)
    return f"{parent_path}/{current_node_id}".lstrip('/')

# Function to recursively gather text from both the current element and its children
def gather_text(element):
    # If the element itself has text, capture it
    text = element.get_text(separator=' ', strip=True) if element else ''
    return text

# Function to parse the HTML and extract paths and content for use in LAM models
def parse_html_for_rag(html_content):
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # List to store parsed data
    data = []

    # Iterate through all elements with 'backend_node_id' attributes
    for element in soup.find_all(attrs={"backend_node_id": True}):
        # Create hierarchical path using parent-child relationships
        path = create_path(element)
        
        # Gather text content from the current element and its children
        text = gather_text(element)
        
        # Check for interactive components and attributes that signify interaction
        is_interactive = element.name in ['button', 'a', 'input', 'select', 'textarea'] or element.has_attr('role')
        
        # Extract additional attributes that might indicate interactive behavior
        element_info = {
            "backend_node_id": element.get("backend_node_id"),
            "tag": element.name,
            "text": text,
            "path": path,
            "is_interactive": is_interactive,
            "role": element.get("role"),
            "aria_label": element.get("aria_label"),
        }
        
        # Include elements that are interactive or have meaningful text content
        if is_interactive or text.strip():  
            data.append(element_info)

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)

    return df

# NOTE: We could clean the text

In [232]:
parsed_task = parse_html_for_rag(html)
print(parsed_task.to_string(index=False))

backend_node_id     tag                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [None]:
def flatten_and_verbalize_html(dataFrameRow):
    text = '''tag: {tag}.
    text: {text}.
    path: {path}'''
    
    return text.format(tag=dataFrameRow['tag'], text=dataFrameRow['text'], path=dataFrameRow['path'])

In [180]:
df = parsed_task.copy()
df['text_combined'] = df.apply(lambda row: f"tag:{row['tag']} path:{row['path']} text:{row['text']} is_interactive:{row['is_interactive']} role:{row['role']} aria_label:{row['aria_label']}", axis=1)


print(df['text_combined'][0])

tag:html path:117 text:Skip to main content Use Tock at your business Book a reservation Reservations Search DELICIOUS STARTS HERE. Reservation type Dine in Pickup Delivery Events Wineries Everything Location Date Time Now 11:30 AM 12:00 PM 12:30 PM 1:00 PM 1:30 PM 2:00 PM 2:30 PM 3:00 PM 3:30 PM 4:00 PM 4:30 PM 5:00 PM 5:30 PM 6:00 PM 6:30 PM 7:00 PM 7:30 PM 8:00 PM 8:30 PM 9:00 PM 9:30 PM 10:00 PM 10:30 PM 11:00 PM 11:30 PM Party size 1 guest 2 guests 3 guests 4 guests 5 guests 6 guests 7 guests 8 guests 9 guests 10 guests Search Explore all that Tock has to offer Dine in Pickup Delivery Events Wineries New & Notable The latest & greatest on Tock Explore all Explore all Agni Columbus, OH - Brewery District Grill Streetside 62 Bistro Washington Court House, OH Restaurant Hell's Backbone Grill & Farm Boulder, UT Four Corners Farm To Table Symposium Cincinnati, OH - East Walnut HIlls Wine Shop The Merchant Tavern Akron, OH - Merriman Valley American Luigi's Ristorante Italiano Mason, OH

### EMBEDDINGS

In [181]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Function to create embeddings from HTML content dataframe
def generate_html_embeddings(df, embedding_model):
    # Combine the relevant text fields into a single string for embedding
    df['text_combined'] = df['text_combined'] = df.apply(lambda row: f"tag:{row['tag']} path:{row['path']} text:{row['text']} is_interactive:{row['is_interactive']} role:{row['role']} aria_label:{row['aria_label']}", axis=1)
    
    # Generate embeddings for each combined text field
    embeddings = embedding_model.encode(df['text_combined'].tolist(), convert_to_tensor=False)
    
    return embeddings

In [182]:
embeddings = generate_html_embeddings(parsed_task.copy(), model)
print(embeddings.shape)

(464, 384)


## RETRIEVAL

In [183]:
# Extract the user prompt from the task
# NOTE: 
#   1. We could use some pre-reasoning to include with the user prompt
#   2. We should use some way to enrich the query with task information like the reasoning and the current step
user_prompt = task['confirmed_task']
print(user_prompt)

Book a winery tour in Napa Valley in a winery which serves Mediterranean cuisine with wine testing for 4 guests on April 15, 10 am in a outdoor setup.


In [233]:
# Prompt embeddings
#query = "List of wineries in Napa Valley that offer Mediterranean cuisine."
query = user_prompt
query_embeddins = model.encode(query, convert_to_tensor=False)

In [234]:
# Get similarities
similarities = model.similarity(query_embeddins, embeddings)
print(similarities)

tensor([[ 4.3333e-01,  4.3392e-01,  4.2818e-01,  4.2511e-01,  4.3043e-01,
          1.5788e-02, -3.4281e-03,  5.3955e-02,  4.4739e-02,  1.2332e-01,
          2.5099e-02,  3.5134e-02,  7.1996e-02,  1.1177e-01,  1.1126e-01,
          1.1863e-01,  2.9745e-02,  1.6952e-01,  9.5700e-02,  1.0261e-01,
          1.7425e-01,  9.1837e-02,  1.0026e-01,  3.9995e-03,  2.1592e-02,
          4.6996e-03,  7.0225e-03,  3.4205e-02,  8.0389e-02,  4.8912e-01,
          4.5812e-01,  2.9997e-01,  1.2350e-01,  1.1840e-01,  1.1120e-01,
          1.2129e-01,  1.0508e-01,  2.4788e-02,  1.7118e-02,  1.3667e-02,
          1.2717e-02,  2.4388e-01,  2.6089e-01,  1.4937e-01,  1.1218e-01,
          8.0452e-02,  1.3174e-01,  1.2126e-01,  7.9374e-02,  3.6005e-02,
          3.1303e-02,  5.2110e-03,  6.8236e-02,  4.1391e-02,  4.8689e-02,
          1.7835e-02,  3.1033e-01,  2.4985e-01,  3.2177e-02,  4.0236e-03,
          3.1360e-02,  6.4011e-02,  3.4536e-02, -2.0160e-02,  5.4355e-02,
          6.0214e-03,  5.4480e-02,  5.

In [235]:
# Get the top k similar indices
import tensorflow as tf
top_values, top_indices = tf.math.top_k(similarities, k=10)

print("Similarity scores:", top_values.numpy())
print("Top indices:", top_indices.numpy())

Similarity scores: [[0.4891241  0.4581176  0.43391582 0.4333338  0.43042767 0.42818287
  0.42511314 0.31032544 0.30919996 0.2999651 ]]
Top indices: [[ 29  30   1   0   4   2   3  56 177  31]]


In [236]:
# Display the top 5 most relevant elements
indexes = top_indices.numpy()

print ("Top 5 most relevant elements:\n")
for i in indexes:
    print(parsed_task.iloc[i].to_string(index=False))

Top 5 most relevant elements:

backend_node_id    tag                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

#### Evaluating results

In [237]:
# Task positive candidates
print(task['actions'][0]['pos_candidates'])

[{'attributes': '{"backend_node_id": "110", "bounding_box_rect": "557.671875,634.390625,24,24", "class": "MuiSvgIcon-root css-tdzr9e", "data_pw_testid_buckeye_candidate": "1"}', 'backend_node_id': '110', 'is_original_target': True, 'is_top_level_target': True, 'tag': 'svg'}]


In [238]:
def find_node_with_id(id):
    return parsed_task[parsed_task['path'].str.contains(id)]

In [240]:
# Find rows where the 'path' column contains the specified value
results = find_node_with_id('110')
print(results.to_string(index=False))

backend_node_id tag text                                                    path  is_interactive role aria_label placeholder
            110 svg      561/562/569/570/647/648/649/663/664/722/725/727/728/110            True None       None        None


In [241]:
found_nodes = find_node_with_id('727')
print(found_nodes.to_string(index=False))

backend_node_id    tag text                                                    path  is_interactive role      aria_label placeholder
            728 button          561/562/569/570/647/648/649/663/664/722/725/727/728            True None Select location        None
            110    svg      561/562/569/570/647/648/649/663/664/722/725/727/728/110            True None            None        None


In [255]:
# Print found nodes as tree
from typing import Dict, List
from dataclasses import dataclass

@dataclass
class Element:
    id: str
    tag: str
    children: List['Element']

def create_tree(paths_data: List[dict]) -> Dict[str, Element]:
    elements = {}
    
    # First, create all elements that are explicitly listed
    for item in paths_data:
        elements[item['backend_node_id']] = Element(
            id=item['backend_node_id'],
            tag=item['tag'],
            children=[]
        )
    
    # Then, establish parent-child relationships
    for item in paths_data:
        path = item['path'].split('/')
        current_id = item['backend_node_id']
        
        # Find the parent of this element
        parent_index = path.index(current_id) - 1
        if parent_index >= 0:
            parent_id = path[parent_index]
            if parent_id in elements:  # Only connect if parent is in our list
                elements[parent_id].children.append(elements[current_id])
    
    # Find the root(s) - elements that are not children of any other element
    all_children = set(child.id for element in elements.values() 
                       for child in element.children)
    roots = {id: elem for id, elem in elements.items() 
             if id not in all_children}
    
    return roots

def print_tree(element: Element, level: int = 0):
    indent = "  " * level
    print(f"{indent}- {element.id} ({element.tag})")
    for child in element.children:
        print_tree(child, level + 1)

In [259]:
# Print results as a tree
roots = create_tree(find_node_with_id("722").to_dict(orient='records'))
for root in roots.values():
    print_tree(root)

- 722 (div)
  - 723 (label)
    - 724 (text)
- 726 (input)
- 728 (button)
  - 110 (svg)
