# DATASET TRANSFORMATION


### Mind2Web data

    "annotation_id" (str): unique id for each task
    "website" (str): website name
    "domain" (str): website domain
    "subdomain" (str): website subdomain
    "confirmed_task" (str): task description
    "action_reprs" (list[str]): human readable string representation of the action sequence
    "actions" (list[dict]): list of actions (steps) to complete the task
        "action_uid" (str): unique id for each action (step)
        "raw_html" (str): raw html of the page before the action is performed
        "cleaned_html" (str): cleaned html of the page before the action is performed
        "operation" (dict): operation to perform
            "op" (str): operation type, one of CLICK, TYPE, SELECT
            "original_op" (str): original operation type; contains the additional HOVER and ENTER operations, which are mapped to CLICK; not used
            "value" (str): optional value for the operation, e.g., text to type, option to select
        "pos_candidates" (list[dict]): ground truth elements. Here we only include positive elements that exist in "cleaned_html" after our preprocessing, so "pos_candidates" might be empty. The original labeled element can always be found in the "raw_html".
            "tag" (str): tag of the element
            "is_original_target" (bool): whether the element is the original target labeled by the annotator
            "is_top_level_target" (bool): whether the element is a top-level target found by our algorithm. Please see the paper for more details.
            "backend_node_id" (str): unique id for the element
            "attributes" (str): serialized attributes of the element, use json.loads to convert back to dict
        "neg_candidates" (list[dict]): other candidate elements in the page after preprocessing, has similar structure as "pos_candidates"


## LOAD, TRANSFORM AND SAVE DATASET

#### LOADING

In [11]:
# Load the "osunlp/Mind2Web" dataset from huggingface and keep a handle on
# its 'train' split, which is the split the cells below work on
from datasets import load_dataset

ds = load_dataset("osunlp/Mind2Web")
ds_train = ds['train']

#### PARSING DATASET

In [18]:
# From the HTML of each task extract the nodes that are in the positive candidates and negative candidates and have text
from dataclasses import dataclass
import re
# Class for storing all the node's data
@dataclass
class Node:
    """A candidate element extracted from the cleaned HTML of an action.

    Attributes:
        node_id: value of the element's ``backend_node_id`` attribute.
        text: text associated with the element; ``None`` when none was found.
        pos_candidate: ``True`` for a positive candidate, ``False`` for a
            negative one.
    """

    # Declared as dataclass fields (instead of a hand-written __init__) so the
    # generated __eq__ and __repr__ actually take the values into account.
    # The generated __init__ keeps the (node_id, text, pos_candidate) order.
    node_id: str
    # String annotation: dataclasses does not evaluate it, so it is valid on
    # every Python version the notebook may run on.
    text: "str | None"
    pos_candidate: bool

    def to_dict(self) -> dict:
        """Return the node as a plain dict with one key per field."""
        return {
            'node_id': self.node_id,
            'text': self.text,
            'pos_candidate': self.pos_candidate
        }
        
@dataclass
class Task:
    """A task prompt together with the nodes extracted from its actions.

    Attributes:
        task_id: identifier of the task.
        prompt: task description.
        nodes: extracted nodes; each one must provide a ``to_dict()`` method
            for ``Task.to_dict`` to work.
    """

    # Declared as dataclass fields (instead of a hand-written __init__) so the
    # generated __eq__ and __repr__ actually take the values into account.
    # The generated __init__ keeps the (task_id, prompt, nodes) order.
    task_id: str
    prompt: str
    nodes: list

    def to_dict(self) -> dict:
        """Return the task id and its nodes as plain dicts.

        NOTE: the prompt is not part of the returned dict.
        """
        return {
            'task_id': self.task_id,
            'nodes': [node.to_dict() for node in self.nodes]
        }

# Extract the nodes from the HTML       
def extract_nodes(html, positive_candidates, negative_candidates):
    """Return the candidate elements found in ``html`` as ``Node`` objects.

    Args:
        html: HTML string to parse.
        positive_candidates: iterable of dicts with a ``backend_node_id`` key;
            matching elements are returned with ``pos_candidate=True``.
        negative_candidates: iterable of dicts with a ``backend_node_id`` key;
            matching elements are returned with ``pos_candidate=False``.

    Returns:
        A list of ``Node`` in document order. An id listed in both candidate
        collections is treated as positive. Elements whose id is in neither
        collection are skipped.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    nodes = []

    positive_candidates_id = {candidate['backend_node_id'] for candidate in positive_candidates}
    negative_candidates_id = {candidate['backend_node_id'] for candidate in negative_candidates}

    for element in soup.find_all():
        if not element.has_attr('backend_node_id'):
            continue

        node_id = element.get('backend_node_id')
        is_positive = node_id in positive_candidates_id
        if not is_positive and node_id not in negative_candidates_id:
            # Not a candidate: skip it before paying for the text extraction,
            # which walks the whole subtree of the element.
            continue

        # Collapse every whitespace run into a single space.
        # NOTE: some nodes have neither text nor aria_label, in which case the
        # stored text is None. The text could also be cleaned up further
        # (e.g. removing stopwords).
        node_text = re.sub(r'\s+', ' ', element.text).strip()
        if node_text == '' or node_text == 'NaN':
            node_text = element.get('aria_label')

        nodes.append(Node(node_id, node_text, is_positive))

    return nodes

# Extract the data from a task
def extract_data_from_task(task):
    """Build a ``Task`` from one dataset record.

    Reads ``annotation_id`` and ``confirmed_task`` from the record, then runs
    ``extract_nodes`` on the ``cleaned_html`` of every entry in ``actions``
    and concatenates the results in action order.
    """
    identifier = task['annotation_id']
    description = task['confirmed_task']

    collected = []
    for step in task['actions']:
        positives = step['pos_candidates']
        negatives = step['neg_candidates']
        page = step['cleaned_html']
        collected.extend(extract_nodes(page, positives, negatives))

    # The prompt is stored untouched; deconstructing it could be done either
    # before storing it or afterwards.
    return Task(identifier, description, collected)
    
# Extract the data from every task in the dataset (saving it as csv files is done by store_data)
def extract_data(dataset):
    """Run ``extract_data_from_task`` on every record of ``dataset``.

    Prints a ``Progress: done/total`` counter that rewrites itself in place
    (carriage return, no newline) and returns the extracted tasks as a list,
    in dataset order.
    """
    total = len(dataset)
    done = 0
    extracted = []

    # Initial 0/total line, shown before the first record is processed
    print(f'Progress: {done}/{total}', end='\r')
    for done, record in enumerate(dataset, start=1):
        extracted.append(extract_data_from_task(record))
        print(f'Progress: {done}/{total}', end='\r')

    print('Data extracted from the dataset')
    return extracted


def store_data(tasks):
    """Write the extracted tasks to two csv files under ``dataset/``.

    ``dataset/task_prompts.csv`` gets one row per task (``task_id``,
    ``prompt``) and ``dataset/nodes.csv`` one row per node (``task_id``,
    ``node_id``, ``text``, ``pos_candidate``). Existing files are overwritten.

    Args:
        tasks: iterable of objects exposing ``task_id``, ``prompt`` and
            ``nodes``; each node exposes ``node_id``, ``text`` and
            ``pos_candidate``.
    """
    import os

    import pandas as pd

    # Explicit columns keep the header row even when there is nothing to
    # write, so the files can always be read back.
    task_prompts_df = pd.DataFrame(
        [{'task_id': task.task_id, 'prompt': task.prompt} for task in tasks],
        columns=['task_id', 'prompt'],
    )
    nodes_df = pd.DataFrame(
        [
            {
                'task_id': task.task_id,
                'node_id': node.node_id,
                'text': node.text,
                'pos_candidate': node.pos_candidate,
            }
            for task in tasks
            for node in task.nodes
        ],
        columns=['task_id', 'node_id', 'text', 'pos_candidate'],
    )

    # to_csv does not create missing directories
    os.makedirs('dataset', exist_ok=True)

    task_prompts_df.to_csv('dataset/task_prompts.csv', index=False)
    nodes_df.to_csv('dataset/nodes.csv', index=False)

    print('Data stored at dataset/task_prompts.csv and dataset/nodes.csv')

In [19]:
# Test data extraction from dataset: run the extraction and the csv export
# on a one-record selection (index 0) of the train split.
# NOTE: store_data writes to the same dataset/*.csv paths as the full run.
test_set = ds_train.select(range(1))
tasks = extract_data(test_set)
store_data(tasks)

Data extracted from the dataset
Data stored at dataset/task_prompts.csv and dataset/nodes.csv


#### EXTRACT DATASET DATA

In [20]:
# Extract every task of the train split and write dataset/task_prompts.csv
# and dataset/nodes.csv
tasks = extract_data(ds_train)
store_data(tasks)

Data extracted from the dataset
Data stored at dataset/task_prompts.csv and dataset/nodes.csv


## INDEXING

#### LOAD DATA

In [37]:
import pandas as pd
def load_data():
    """Read back the two csv files of the extraction step.

    Returns:
        A ``(task_prompts_df, nodes_df)`` tuple of DataFrames loaded from
        ``dataset/task_prompts.csv`` and ``dataset/nodes.csv``.
    """
    csv_paths = ('dataset/task_prompts.csv', 'dataset/nodes.csv')
    # The generator is consumed left to right, so prompts are read first
    prompts_frame, nodes_frame = (pd.read_csv(path) for path in csv_paths)
    return prompts_frame, nodes_frame

def generate_embeddings(task_prompts_df, nodes_df, model):
    """Encode the task prompts and the node texts with ``model``.

    Args:
        task_prompts_df: DataFrame with a ``prompt`` column.
        nodes_df: DataFrame with a ``text`` column.
        model: object exposing ``encode(list_of_str)``.

    Returns:
        ``(prompt_embeddings, node_text_embeddings)``, i.e. whatever
        ``model.encode`` returns for each of the two lists.
    """
    # Every value goes through str() first, so non-string cells are encoded
    # as their string form (a missing value read as float NaN becomes 'nan').
    prompt_strings = list(map(str, task_prompts_df['prompt'].tolist()))
    node_strings = list(map(str, nodes_df['text'].tolist()))

    prompt_vectors = model.encode(prompt_strings)
    node_vectors = model.encode(node_strings)
    return prompt_vectors, node_vectors

In [22]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model later passed to generate_embeddings
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Read back dataset/task_prompts.csv and dataset/nodes.csv
task_prompts_df, nodes_df = load_data()



In [23]:
# Number of rows loaded: task prompts first, then nodes
print(len(task_prompts_df), len(nodes_df))

1009 4683115


#### GENERATE EMBEDDINGS

In [38]:
# Get only one task for testing: the first row of the prompts table and the
# node rows that share its task_id
test_task = task_prompts_df.head(1)
test_nodes = nodes_df[nodes_df['task_id'] == test_task['task_id'].values[0]]

# Generate embeddings for the test task
task_prompts_embeddings, nodes_texts_embeddings = generate_embeddings(test_task, test_nodes, model)

# Store embeddings for testing evaluation
# NOTE: these are the same .npy paths the full-dataset run writes to
import numpy as np
np.save('dataset/task_prompts_embeddings.npy', task_prompts_embeddings)
np.save('dataset/nodes_texts_embeddings.npy', nodes_texts_embeddings)

# Shapes of the prompt embeddings and of the node text embeddings
print(task_prompts_embeddings.shape, nodes_texts_embeddings.shape)

(1, 768) (2076, 768)


In [None]:
# Generate embeddings for all the task prompts and node texts
# (the call previously used the misspelled, undefined name
# `generate_emebeddings`, which raises NameError)
task_prompts_embeddings, nodes_texts_embeddings = generate_embeddings(task_prompts_df, nodes_df, model)

# Store the embeddings
import numpy as np

np.save('dataset/task_prompts_embeddings.npy', task_prompts_embeddings)
np.save('dataset/nodes_texts_embeddings.npy', nodes_texts_embeddings)

print('Embeddings stored at dataset/task_prompts_embeddings.npy and dataset/nodes_texts_embeddings.npy')

## RETRIEVING

#### EMBEDDINGS LOADING