In [67]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms
from bs4 import BeautifulSoup
from torch.utils.data import DataLoader, TensorDataset
import os, sys
# Append the local browsergym checkout to sys.path (once), presumably so the
# `dev.*` wildcard imports that follow can be resolved.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
project_root = os.path.abspath('/Users/subhojit/workspace/lcow_iclr2025/browsergym')
if project_root not in sys.path:
    sys.path.append(project_root)

from dev.embedding_store import *
from dev.sim_dim_gen import *

In [None]:
class MLP(nn.Module):
    """Fully connected binary scorer: two ReLU hidden layers and a sigmoid output.

    Args:
        input_size: Number of features per sample once the input is flattened.
        hidden_size: Width shared by both hidden layers.
    """

    def __init__(self, input_size=7680, hidden_size=200):
        super().__init__()
        # The attribute names are also the state_dict keys, so they are kept
        # as layer1..layer3 to stay compatible with saved checkpoints.
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Flatten each sample of ``x`` and return scores of shape (batch, 1) in (0, 1)."""
        batch_size = x.size(0)
        hidden = torch.relu(self.layer1(x.view(batch_size, -1)))
        hidden = torch.relu(self.layer2(hidden))
        logits = self.layer3(hidden)
        return torch.sigmoid(logits)

In [None]:
import pandas as pd
# Read the trajectory CSV into a DataFrame and preview its first rows.
# NOTE(review): absolute, machine-specific path; `df` is not referenced by any
# other cell visible in this notebook.
trajectory_file_path = '/Users/subhojit/workspace/lcow_iclr2025/browsergym/workarena_seed_demo_iter_0.csv'
df = pd.read_csv(trajectory_file_path)
df.head()




In [None]:
import os

import openai
import numpy as np
from sklearn.decomposition import PCA

# Demo: embed a few sentences with text-embedding-3-large and project the
# vectors onto 3 principal components.

# Take the key from the environment rather than hardcoding it in the notebook.
# When OPENAI_API_KEY is unset this falls back to the previous empty string.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

texts = [
    "WILBUR learns web navigation by in-context learning.",
    "Language models can solve many real-world tasks.",
    "Web agents must understand HTML structure."
]

# The embeddings endpoint accepts a list of inputs, so a single request covers
# every text instead of one round-trip per sentence.
response = openai.embeddings.create(
    model="text-embedding-3-large",
    input=texts
)

# Every returned item carries the index of the input it belongs to; sort on it
# so that row i of `embeddings` is the vector for texts[i].
embeddings = np.array(
    [item.embedding for item in sorted(response.data, key=lambda item: item.index)]
)

pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(embeddings)

print("Reduced shape:", reduced_embeddings.shape)
print(reduced_embeddings)




In [None]:
from datasets import load_dataset

# Load the "osunlp/Mind2Web" dataset; later cells read its 'train' split.
ds = load_dataset("osunlp/Mind2Web")

In [None]:
# Inspect the first training example: print the split size, then pull out the
# example's action representations, task description, action list, and the
# cleaned HTML attached to its first action.
sample = ds['train'][0]
print(len(ds['train']))
action_rep = sample['action_reprs']
# print("Action representation: ", action_rep)
goal = sample["confirmed_task"]
# print("Goal: ", goal)
actions = sample['actions']
# print("Actions: ", actions)
init_dom = actions[0]['cleaned_html']
# print('init dom: ', init_dom)


In [None]:
import json
# Flatten the training episodes into {'action', 'dom'} records, one per action,
# and dump them to JSON.
count = 0
res = []
for elem in ds['train']:
    acts = elem['actions']
    for a in acts:
        res.append({'action': a['operation'], 'dom': a['cleaned_html']})
        count += 1
    # The cap is checked once per episode, not per action, so every action of
    # the episode that crosses the threshold is kept (res can exceed 101 items).
    if count > 100:
        break

# NOTE(review): the next cell reads "training_data_new_aug.json", not this file;
# the step that produces the "_aug" file (and the 'nl_action' field used later)
# is not visible in this notebook.
with open('training_data_new.json', 'w') as f:
    json.dump(res, f, indent=2)



In [None]:
import tiktoken
from bs4 import BeautifulSoup

# Tokenizer that tiktoken associates with the text-embedding-3-large model.
encoding = tiktoken.encoding_for_model("text-embedding-3-large")

# Load the task records; each one is expected to carry a 'dom' HTML string.
# NOTE(review): relies on `json` having been imported by an earlier cell.
with open("training_data_new_aug.json", 'r') as f:
    tasks = json.load(f)

def extract_text_from_html(html: str) -> str:
    """Return the visible text of an HTML document as one space-separated string.

    <script>, <style> and <noscript> elements are removed before the text is
    collected, and each text fragment is stripped of surrounding whitespace.
    """
    non_visible_tags = ["script", "style", "noscript"]
    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(non_visible_tags):
        element.decompose()
    return document.get_text(separator=" ", strip=True)

# Add the plain-text rendering of each task's DOM under 'ext_dom', then save
# the augmented records to a new JSON file.
for task in tasks:
    text = task['dom']
    ex_text = extract_text_from_html(text)
    # NOTE(review): num_tokens is computed but never used in this cell —
    # possibly a leftover from checking inputs against a token limit; confirm.
    num_tokens = len(encoding.encode(ex_text))
    task['ext_dom'] = ex_text

with open('training_data_new_aug_1.json', 'w') as f:
    json.dump(tasks, f, indent=2)


In [51]:
# Cache object that is passed to every get_embedding(...) call below.
# load_cache comes from one of the `dev.*` wildcard imports — presumably
# dev.embedding_store; confirm.
cache = load_cache()

In [84]:
with open("training_data_new_aug_1.json", 'r') as f:
    tasks = json.load(f)

# Build a balanced binary dataset with two rows per task. Each row is the
# concatenation [action embedding, DOM-text embedding, candidate embedding]:
#   * candidate = embedding of get_sim(action)  -> label 1
#   * candidate = embedding of get_opp(action)  -> label 0
# The list is called `features` rather than `input` so that the builtin
# input() is not shadowed for the rest of the kernel session.
features = []
target = []
for task in tasks:
    act = task['nl_action']
    ext_dom = task['ext_dom']
    act_emb = get_embedding(act, cache)
    ext_emb = get_embedding(ext_dom, cache)
    sim_emb = get_embedding(get_sim(act), cache)
    opp_emb = get_embedding(get_opp(act), cache)
    # Positive example.
    sim_con = np.concatenate((act_emb, ext_emb, sim_emb), axis=0)
    features.append(sim_con)
    target.append(1)
    # Negative example: same action/DOM context, different candidate.
    opp_con = np.concatenate((act_emb, ext_emb, opp_emb), axis=0)
    features.append(opp_con)
    target.append(0)

# X has one row per example; y holds the matching 0/1 labels.
X = np.array(features)
y = np.array(target)


In [85]:
# Sanity check: display the shapes of the feature matrix and the label vector.
X.shape, y.shape

((202, 9216), (202,))

In [64]:
# from sklearn.decomposition import PCA
#
# pca = PCA(n_components=202)
# X_reduced = pca.fit_transform(X)

In [86]:
# The PCA step in the previous cell is commented out, so the "reduced" matrix
# is simply the full feature matrix (same object, no copy).
X_reduced = X
X_reduced.shape

(202, 9216)

In [87]:
# Pick the compute device. Apple-silicon MPS stays the first choice (the
# original hardcoded value); fall back to CUDA and then CPU so the notebook
# also runs on machines without MPS instead of failing in `.to(device)`.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fix the RNG so weight initialisation and DataLoader shuffling are repeatable
# across Restart & Run All.
torch.manual_seed(0)

# The input width is taken from the data, so it follows whatever feature
# matrix (PCA-reduced or not) was selected above.
model = MLP(input_size=X_reduced.shape[1], hidden_size=200).to(device)

# Labels are reshaped to (N, 1) to match the model's (N, 1) output.
X_tensor = torch.tensor(X_reduced, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)

dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [88]:
# Binary cross-entropy on probabilities: MLP.forward already applies a sigmoid,
# so the probability-based BCELoss is used rather than a logits-based loss.
criterion = nn.BCELoss()
# AdamW over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)


In [89]:
# Train for 50 epochs and print the mean per-batch loss after each epoch.
for epoch in range(50):
    model.train()
    total_loss = 0.0

    for X_batch, y_batch in dataloader:
        # The dataset tensors were already moved to `device` when they were
        # created, so these transfers are expected to be no-ops.
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Average over the number of batches, not the number of samples.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")

Epoch 1, Loss: 0.6697
Epoch 2, Loss: 0.5186
Epoch 3, Loss: 0.2699
Epoch 4, Loss: 0.1244
Epoch 5, Loss: 0.0833
Epoch 6, Loss: 0.0604
Epoch 7, Loss: 0.0402
Epoch 8, Loss: 0.0319
Epoch 9, Loss: 0.0203
Epoch 10, Loss: 0.0151
Epoch 11, Loss: 0.0109
Epoch 12, Loss: 0.0058
Epoch 13, Loss: 0.0052
Epoch 14, Loss: 0.0033
Epoch 15, Loss: 0.0025
Epoch 16, Loss: 0.0020
Epoch 17, Loss: 0.0020
Epoch 18, Loss: 0.0017
Epoch 19, Loss: 0.0014
Epoch 20, Loss: 0.0013
Epoch 21, Loss: 0.0012
Epoch 22, Loss: 0.0010
Epoch 23, Loss: 0.0009
Epoch 24, Loss: 0.0008
Epoch 25, Loss: 0.0007
Epoch 26, Loss: 0.0006
Epoch 27, Loss: 0.0005
Epoch 28, Loss: 0.0005
Epoch 29, Loss: 0.0005
Epoch 30, Loss: 0.0004
Epoch 31, Loss: 0.0004
Epoch 32, Loss: 0.0003
Epoch 33, Loss: 0.0003
Epoch 34, Loss: 0.0003
Epoch 35, Loss: 0.0002
Epoch 36, Loss: 0.0002
Epoch 37, Loss: 0.0002
Epoch 38, Loss: 0.0002
Epoch 39, Loss: 0.0002
Epoch 40, Loss: 0.0002
Epoch 41, Loss: 0.0001
Epoch 42, Loss: 0.0002
Epoch 43, Loss: 0.0002
Epoch 44, Loss: 0.00

In [75]:
# Accuracy at a 0.5 decision threshold.
# NOTE(review): X_tensor / y_tensor are the same tensors the DataLoader trains
# on, so this is training accuracy; no held-out split exists in this notebook.
model.eval()
with torch.no_grad():
    preds = model(X_tensor)
    preds_binary = (preds > 0.5).float()
    accuracy = (preds_binary == y_tensor).float().mean()
    print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000


In [90]:
# Persist only the parameters (state_dict), not the module object; the path is
# relative to the current working directory.
torch.save(model.state_dict(), "mlp_model.pt")

In [91]:
# Reload the saved parameters into the existing model, mapping the tensors onto
# the active device. The model must have the same layer sizes as when saved.
model.load_state_dict(torch.load("mlp_model.pt", map_location=device))

<All keys matched successfully>

In [100]:
# Score one example end to end: rebuild the feature vector for the first task
# and run it through the model.
# NOTE(review): tasks[0] comes from the same file the training set was built
# from, so this is a check on a training example, not on unseen data.
with open("training_data_new_aug_1.json", 'r') as f:
    tasks = json.load(f)

t = tasks[0]
act = t['nl_action']
ext_dom = t['ext_dom']
act_emb = get_embedding(act, cache)
ext_emb = get_embedding(ext_dom, cache)
sim_emb = get_embedding(get_sim(act), cache)
opp_emb = get_embedding(get_opp(act), cache)
sim_con = np.concatenate((act_emb, ext_emb, sim_emb), axis=0)
# NOTE(review): opp_con (and opp_emb) are built but never passed to the model
# in this cell; only the label-1 style vector sim_con is scored.
opp_con = np.concatenate((act_emb, ext_emb, opp_emb), axis=0)

# unsqueeze(0) adds the batch dimension the model expects.
input_tensor = torch.tensor(sim_con, dtype=torch.float32).unsqueeze(0).to(device)

with torch.no_grad():
    output = model(input_tensor)
    prediction = (output > 0.5).float()

print(f"Predicted probability: {output.item():.8f}")
print(f"Predicted class: {int(prediction.item())}")

Predicted probability: 0.99889827
Predicted class: 1
