# RAG development  using SBERT

### Model setup

#### Installing sbert transformers

In [1]:
%pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


#### Setup embeddings model

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
2024-10-05 16:41:46.701312: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-05 16:41:46.799005: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-05 16:41:47.429450: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## INDEXING

### LOAD

In [4]:
#@title Import Mind2Web dataset form huggingface
from datasets import load_dataset

# Text only dataset
ds = load_dataset("osunlp/Mind2Web")
# Multimodal dataset
#ds = load_dataset("osunlp/Multimodal-Mind2Web")

In [49]:
# Extract the train split from the dataset
train_ds = ds['train']

# Get an example task for testing
task = train_ds[1]
# Get the HTML from one of the task's actions for testing
html = task['actions'][0]['cleaned_html']
print(len(html))

42254


## SPLITTING

In [6]:
# Function to extract text from HTML and parse it
from bs4 import BeautifulSoup
import pandas as pd

# Function to create a path with parent ids
def create_path(node, path=''):
    # Recursively build the path using 'backend_node_id' attributes
    if not node or not node.has_attr('backend_node_id'):
        return path
    current_node_id = node['backend_node_id']
    parent_path = create_path(node.parent, path)
    return f"{parent_path}/{current_node_id}".lstrip('/')

# Function to parse the HTML and extract paths and content
def parse_html_for_rag(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # List to store parsed data
    data = []

    # Iterate through all elements with 'backend_node_id'
    for element in soup.find_all(attrs={"backend_node_id": True}):
        # Create hierarchical path
        path = create_path(element)
        # Get the text content of the element
        text = element.get_text(strip=True)
        # Append to data list
        data.append({
            "backend_node_id": element["backend_node_id"],
            "tag": element.name,
            "text": text,
            "path": path
        })

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)
    
    return df

# NOTE: We could clean the text further by removing special characters, etc.

In [50]:
parsed_task = parse_html_for_rag(html)
print(parsed_task)

    backend_node_id   tag                                               text  \
0               117  html  Skip to main contentUse Tock at your businessB...   
1               561   div  Skip to main contentUse Tock at your businessB...   
2               562   div  Skip to main contentUse Tock at your businessB...   
3               569   div  Skip to main contentUse Tock at your businessB...   
4               570   div  Skip to main contentUse Tock at your businessB...   
..              ...   ...                                                ...   
470            1422  text                                           Facebook   
471            1424  span                                  Explore Tock 2023   
472            1425  text                                  Explore Tock 2023   
473            1511     a                                               null   
474            1512  text                                               null   

                                       

### EMBEDDINGS

In [51]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Function to create embeddings from HTML content dataframe
def generate_html_embeddings(df, embedding_model):
    # Combine the relevant text fields into a single string for embedding
    df['text_combined'] = df.apply(lambda row: f"{row['tag']} {row['path']} {row['text']}", axis=1)
    
    # Generate embeddings for each combined text field
    embeddings = embedding_model.encode(df['text_combined'].tolist(), convert_to_tensor=False)
    
    return embeddings

In [52]:
embeddings = generate_html_embeddings(parsed_task.copy(), model)
print(embeddings.shape)

(475, 384)


### RETRIEVAL

In [53]:
# Extract the user prompt from the task
# NOTE: 
#   1. We could use some pre-reasoning to include with the user prompt
#   2. We should use some way to enrich the query with task information like the reasoning and the current step
user_prompt = task['confirmed_task']
print(user_prompt)

Book a winery tour in Napa Valley in a winery which serves Mediterranean cuisine with wine testing for 4 guests on April 15, 10 am in a outdoor setup.


In [54]:
# Prompt embeddings
user_prompt_embedding = model.encode(user_prompt, convert_to_tensor=False)

In [55]:
# Get similarities
similarities = model.similarity(user_prompt_embedding, embeddings)
print(similarities)

tensor([[ 3.2217e-01,  3.4749e-01,  3.3877e-01,  3.1184e-01,  3.2577e-01,
          6.4265e-02, -2.2831e-02,  1.0293e-01,  4.4012e-02,  1.6536e-01,
          4.6509e-03,  8.4426e-02, -4.6601e-03,  6.3669e-02,  5.5652e-02,
          1.6974e-01,  1.6421e-01,  1.6076e-01,  9.3829e-02,  1.9727e-01,
          1.9513e-01,  1.3535e-01,  1.9712e-01,  1.8766e-01,  1.3866e-01,
         -1.8315e-02,  3.0039e-02,  7.9458e-03, -1.9339e-02,  3.8067e-02,
          4.2775e-02, -3.1243e-02,  3.8221e-01,  3.7619e-01,  2.9068e-01,
          1.3713e-01,  1.3352e-01,  1.2558e-01,  1.7823e-01,  8.9068e-02,
          1.1478e-01,  2.7420e-03,  9.3036e-02, -9.6099e-03,  2.3148e-01,
          2.3479e-01,  9.0501e-02,  1.8497e-01,  8.7875e-02,  1.1716e-01,
          1.0222e-01,  1.9760e-01,  9.6374e-02,  1.0586e-01,  8.0904e-03,
          1.4517e-01,  5.0066e-02,  1.3921e-01,  5.5587e-02,  3.8763e-01,
          2.5408e-01,  8.6707e-02, -2.3203e-02,  1.8426e-02,  1.3860e-01,
          1.2995e-02, -1.6357e-02, -2.

In [56]:
import tensorflow as tf
top_values, top_indices = tf.math.top_k(similarities, k=5)

print("Similarity scores:", top_values.numpy())
print("Top indices:", top_indices.numpy())

Similarity scores: [[0.39611417 0.39119527 0.38763046 0.38220945 0.37618726]]
Top indices: [[350 349  59  32  33]]


In [57]:
indexes = top_indices.numpy()

print ("Top 5 most relevant elements:\n")
for i in indexes:
    print(parsed_task.iloc[i].to_string(index=False))

Top 5 most relevant elements:

backend_node_id     tag                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

#### Evaluating results

In [58]:
print(task['actions'][0]['pos_candidates'])

[{'attributes': '{"backend_node_id": "110", "bounding_box_rect": "557.671875,634.390625,24,24", "class": "MuiSvgIcon-root css-tdzr9e", "data_pw_testid_buckeye_candidate": "1"}', 'backend_node_id': '110', 'is_original_target': True, 'is_top_level_target': True, 'tag': 'svg'}]


In [73]:
# Find rows where the 'path' column contains the specified value
path_value = "110"
result = parsed_task[parsed_task['path'].str.contains(path_value)]

print(result.to_string(index=False))

backend_node_id tag text                                                    path
            110 svg      561/562/569/570/647/648/649/663/664/722/725/727/728/110
