In [None]:
import os
import json

# 확인할 디렉토리들
TARGET_DIRS = ["train"]

OUTPUT_FILE = "train/mind2web_task_train.jsonl"  # 결과 저장 파일

base_path = "/mnt/raid5/parksh/Mind2web"

results = []

for folder in TARGET_DIRS:
    folder_path = os.path.join(base_path, folder)

    if not os.path.exists(folder_path):
        print(f"경로 없음: {folder_path}")
        continue

    for filename in os.listdir(folder_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)
        #file_name = filename

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for idx, item in enumerate(data):
                annotation_id = item["annotation_id"] if "annotation_id" in item else None
                domain = item["domain"] if "domain" in item else None
                subdomain = item["subdomain"] if "subdomain" in item else None
                website = item["website"] if "website" in item else None
                confirmed = item["confirmed_task"] if "confirmed_task" in item else None

                actions = []
                for a in item.get("actions", []):
                    actions.append({
                        "action_uid": a.get("action_uid"),
                        "operation": a.get("operation"),          # {"op":..., "value":..., "original_op":...}
                        "pos_candidates": a.get("pos_candidates"), #{"tag", "is_original_target", "is_top_level_target", "backend_node_id", "attributes"}
                    })

                results.append({
                    "id": annotation_id,
                    "domain": domain,
                    "sub_domain": subdomain,
                    "website": website,
                    "confirmed_task": confirmed,
                    "action_reprs": item.get("action_reprs"),
                    "actions": actions     
                })

        except Exception as e:
            print(f"에러 발생: {file_path}, {e}")

# 결과 저장
with open(os.path.join(base_path, OUTPUT_FILE), "w", encoding="utf-8") as out:
    for item in results:
        out.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"train / 완료! 총 {len(results)}개 confirmed_task 추출 완료")
print(f"결과 저장 위치: {os.path.join(base_path, OUTPUT_FILE)}")


train / 완료! 총 1009개 confirmed_task 추출 완료
결과 저장 위치: /mnt/raid5/parksh/Mind2web/train/mind2web_task_train.jsonl


In [3]:
import os
import json

# 확인할 디렉토리들
TARGET_DIRS = ["test_domain", "test_task", "test_website"]

OUTPUT_FILE = "mind2web_task_test.jsonl"  # 결과 저장 파일

base_path = "/mnt/raid5/parksh/Mind2web/test"

results = []

for folder in TARGET_DIRS:
    folder_path = os.path.join(base_path, folder)

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        #file_name = filename

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for idx, item in enumerate(data):
                annotation_id = item["annotation_id"] if "annotation_id" in item else None
                domain = item["domain"] if "domain" in item else None
                subdomain = item["subdomain"] if "subdomain" in item else None
                website = item["website"] if "website" in item else None
                confirmed = item["confirmed_task"] if "confirmed_task" in item else None

                results.append({
                    "id": annotation_id,
                    "domain": domain,
                    "sub_domain": subdomain,
                    "website": website,
                    "confirmed_task": confirmed
                })


        except Exception as e:
            print(f"에러 발생: {file_path}, {e}")

# 결과 저장
with open(os.path.join(base_path, OUTPUT_FILE), "w", encoding="utf-8") as out:
    for item in results:
        out.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"test / 완료! 총 {len(results)}개 confirmed_task 추출 완료")
print(f"결과 저장 위치: {os.path.join(base_path, OUTPUT_FILE)}")

test / 완료! 총 1341개 confirmed_task 추출 완료
결과 저장 위치: /mnt/raid5/parksh/Mind2web/test/mind2web_task_test.jsonl


In [5]:
### 두개 이상의 파일을 병합하는 코드

import json

train_path = "train/mind2web_task_train.jsonl"
test_path = "test/mind2web_task_test.jsonl"
output_path = "mind2web_task_all.jsonl"

total_lines = 0

with open(output_path, "w", encoding="utf-8") as outfile:
    # train 파일 먼저 쓰기
    with open(train_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            outfile.write(line + "\n")
            total_lines += 1

    # test 파일 이어서 쓰기
    with open(test_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            outfile.write(line + "\n")
            total_lines += 1

print(f"Total merged lines: {total_lines}")
print(f"Output saved to: {output_path}")


Total merged lines: 2350
Output saved to: mind2web_task_all.jsonl


In [6]:
import json

MIND2WEB_TASK_ALL = "mind2web_task_all.jsonl"
ONLINE_MIND2WEB = "online_mind2web/Online_Mind2Web.json"

# -----------------------
# 1) mind2web_task_all.jsonl에서 id 집합 만들기 (큰 집합)
# -----------------------
mind2web_ids = set()

with open(MIND2WEB_TASK_ALL, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        mid = obj.get("id")
        if mid:
            mind2web_ids.add(mid)

print(f"[mind2web_task_all] total ids: {len(mind2web_ids)}")

# -----------------------
# 2) online_mind2web에서 task_id 집합 만들기 (작은 집합)
# -----------------------
with open(ONLINE_MIND2WEB, "r", encoding="utf-8") as f:
    online_data = json.load(f)

online_ids = set()
for obj in online_data:
    tid = obj.get("task_id")
    if tid:
        online_ids.add(tid)

print(f"[online_mind2web] total task_ids: {len(online_ids)}")

# -----------------------
# 3) 포함 여부 검사
# -----------------------
matched = online_ids & mind2web_ids
missing = online_ids - mind2web_ids

print("\n================ RESULT ================")
print(f"Matched (online ∩ mind2web): {len(matched)}")
print(f"Missing (online - mind2web): {len(missing)}")

# -----------------------
# 4) 결과 출력
# -----------------------
if len(missing) == 0:
    print("✅ SUCCESS: online_mind2web의 모든 task_id가 mind2web_task_all에 포함되어 있습니다.")
else:
    print("❌ WARNING: 일부 task_id가 mind2web_task_all에 존재하지 않습니다.\n")
    print("❗ 누락된 task_id 목록:")
    for tid in sorted(missing):
        print(tid)


[mind2web_task_all] total ids: 2350
[online_mind2web] total task_ids: 300

Matched (online ∩ mind2web): 0
Missing (online - mind2web): 300

❗ 누락된 task_id 목록:
0059adc6b12a3822305deb68929b2de8
005be9dd91c95669d6ddde9ae667125c
0170ca95038b05fa58d463fe627ac605
01abae9608f2d8752a83e08f136f720c
046138801a05ddf56ad94e8672942496
05483c50cc9b04c8ac44c574758fb2bd
0632e496d37badee0350dad358f047c5
070c907d34a4ce71dfdbea38f9c5d4d8
07bdc595306729a028ba06cc7451a80a
07ec4a12cba8090e2dc524d558ac7675
0a0fa834ce41b5297c6474293383759d_110325
0a54069a0ef542e571d1fee7f39c93d5
0b2623e9fa5cea997f76490bcbc5220f
0b51b4fa0295ae80ccd176ebdad6fff6
0b838cd54f826c59c71f600c56b89a11
0e42c3a73f2aece1f854e0ba55b7c8b0
0e5536aaad9d3462b06cf725e6ed535a
11857213ca01510f12813740afd59918_110325
11abb668c751dd56bb41f296a8bb3a13
1223b07536a87e0170ff87cbbebd1d3c
123e8c2fc453f55fadd1d0b9aaf94df4
157f4a79d55e8fa3fd55ba772ba40fbc
15be05973fba714e490cd9c884e4f072
16200f51d63f0a47a58fa17acd49e368
180ed2ec377ef3a4af9035a21522091a
199

In [7]:
import json

MIND2WEB_TASK_ALL = "mind2web_task_all.jsonl"
ONLINE_MIND2WEB = "online_mind2web/Online_Mind2Web.json"


def normalize_text(t: str) -> str:
    """
    앞뒤 공백 제거 + 중복 공백/개행 정규화
    """
    if not t:
        return ""
    return " ".join(t.split())


# -----------------------
# 1) mind2web_task_all.jsonl → confirmed_task 집합 (큰 집합)
# -----------------------
mind2web_tasks = set()

with open(MIND2WEB_TASK_ALL, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        task = normalize_text(obj.get("confirmed_task"))
        if task:
            mind2web_tasks.add(task)

print(f"[mind2web_task_all] total unique confirmed_task: {len(mind2web_tasks)}")


# -----------------------
# 2) online_mind2web → task_description/confirmed_task 집합 (작은 집합)
# -----------------------
with open(ONLINE_MIND2WEB, "r", encoding="utf-8") as f:
    online_data = json.load(f)

online_tasks = set()
raw_online_map = {}  # 정규화된 task -> 원본 task

for obj in online_data:
    task = obj.get("confirmed_task") or obj.get("task_description")
    norm = normalize_text(task)
    if norm:
        online_tasks.add(norm)
        raw_online_map[norm] = task

print(f"[online_mind2web] total unique tasks: {len(online_tasks)}")


# -----------------------
# 3) 포함 여부 검사
# -----------------------
matched = online_tasks & mind2web_tasks
missing = online_tasks - mind2web_tasks

print("\n================ RESULT ================")
print(f"Matched (online ⊆ mind2web): {len(matched)}")
print(f"Missing (online - mind2web): {len(missing)}")


# -----------------------
# 4) 결과 출력
# -----------------------
if len(missing) == 0:
    print("✅ SUCCESS: online_mind2web의 모든 confirmed_task가 mind2web_task_all에 포함되어 있습니다.")
else:
    print("❌ WARNING: 일부 confirmed_task가 mind2web_task_all에 존재하지 않습니다.\n")
    print("❗ 누락된 confirmed_task 목록:\n")

    for t in sorted(missing):
        print("-", raw_online_map.get(t, t))


[mind2web_task_all] total unique confirmed_task: 2350
[online_mind2web] total unique tasks: 300

Matched (online ⊆ mind2web): 60
Missing (online - mind2web): 240

❗ 누락된 confirmed_task 목록:

- Add Elevate at Chicago, IL, to favorites and show a virtual tour.
- Add a $100 Best Buy gift card for a birthday to my cart.
- Add a $50 Uber gift card to the cart.
- Add a Box Combo to my bag with Diet Coke as the drink, and a Kids Combo with milk as the drink. Select the store closest to ZIP 10001 for pickup tomorrow at 12:00 PM.
- Add the best-selling men's T-shirt in large size, short sleeve, and Halloween event style to my cart.
- Add the cheapest black sofa with at least three seats, a leather finish, and at least four stars to my cart.
- Add the cheapest certified refurbished iPad Air with 256GB of storage in any shade of blue to my bag.
- Add the most top-selling Birkenstock's men clogs in brown, size 10-10.5 to my cart.
- Browse Marriott Bonvoy credit cards on Marriott.
- Browse a user hom

In [13]:
import json
import re
from difflib import SequenceMatcher

MIND2WEB_TASK_ALL = "mind2web_task_all.jsonl"
ONLINE_MIND2WEB = "online_mind2web/Online_Mind2Web.json"

# 유사도 임계값 (0~1). 0.85~0.92 사이에서 조정 추천
THRESHOLD = 0.7

def normalize_text(t: str) -> str:
    """소문자화 + 구두점 제거 + 공백 정규화"""
    if not t:
        return ""
    t = t.lower().strip()
    t = re.sub(r"[^\w\s]", " ", t)     # punctuation 제거
    t = " ".join(t.split())           # 공백 정리
    return t

def similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

# -----------------------
# 1) mind2web confirmed_task 로드 (큰 집합)
# -----------------------
mind_raw = []
mind_norm = []

with open(MIND2WEB_TASK_ALL, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        t = obj.get("confirmed_task")
        if not t:
            continue
        mind_raw.append(t)
        mind_norm.append(normalize_text(t))

print(f"[mind2web] loaded tasks: {len(mind_raw)}")

# -----------------------
# 2) online tasks 로드 (작은 집합)
# -----------------------
with open(ONLINE_MIND2WEB, "r", encoding="utf-8") as f:
    online_data = json.load(f)

online_tasks = []
for obj in online_data:
    t = obj.get("confirmed_task") or obj.get("task_description")
    if t:
        online_tasks.append(t)

print(f"[online] loaded tasks: {len(online_tasks)}")

# -----------------------
# 3) 퍼지 매칭 수행
# -----------------------
matched = []   # (online_task, best_mind_task, score)
missing = []   # (online_task, best_mind_task, score)

for ot in online_tasks:
    ot_norm = normalize_text(ot)

    best_score = -1.0
    best_idx = -1

    for i, mt_norm in enumerate(mind_norm):
        s = similarity(ot_norm, mt_norm)
        if s > best_score:
            best_score = s
            best_idx = i

    best_mind_task = mind_raw[best_idx] if best_idx >= 0 else None

    if best_score >= THRESHOLD:
        matched.append((ot, best_mind_task, best_score))
    else:
        missing.append((ot, best_mind_task, best_score))

# -----------------------
# 4) 결과 요약 + 불일치 출력
# -----------------------
print("\n================ RESULT ================")
print(f"Threshold: {THRESHOLD}")
print(f"Total online tasks: {len(online_tasks)}")
print(f"Matched: {len(matched)}")
print(f"Not matched: {len(missing)}")

if len(missing) == 0:
    print("✅ SUCCESS: 퍼지 매칭 기준으로 online의 모든 task가 mind2web에 포함된 것으로 판단됩니다.")
else:
    print("\n❌ NOT MATCHED LIST (score < threshold):")
    # 점수 낮은 것부터 보기 좋게 정렬
    missing_sorted = sorted(missing, key=lambda x: x[2])

    for ot, mt, sc in missing_sorted:
        print("\n---")
        print(f"online : {ot}")
        print(f"best   : {mt}")
        print(f"score  : {sc:.4f}")


[mind2web] loaded tasks: 2350
[online] loaded tasks: 300

Threshold: 0.7
Total online tasks: 300
Matched: 195
Not matched: 105

❌ NOT MATCHED LIST (score < threshold):

---
online : Complete a multiplication quiz on https://www.coolmath4kids.com/, covering multiplication facts for 11-12. The quiz should consist of 10 questions, with unlimited time allowed for each. The goal is to achieve a perfect score of 10 out of 10.
best   : Locate a large store in Washington that has kids' and maternity products, also check if they have a parking lot, and see the directions of the nearest store.
score  : 0.3523

---
online : My baby boy was born on Dec 1, 2025. His weight is 7 lb 8 oz, height is 20 inches, and head circumference is 35 cm. The measurement date is Dec 1, 2025. Can you calculate his growth percentiles using a calculator?
best   : Register to watch the webcast on developing Career ready student. My name is John Doe. My email is johndoe@gmail. I worked at UAI with a job title of Accoun

In [11]:
print("\n========== MATCHED PAIRS (sorted by score desc) ==========\n")

for online_task, mind_task, score in sorted(matched, key=lambda x: x[2], reverse=True):
    print(f"ONLINE   : {online_task}")
    print(f"MIND2WEB : {mind_task}")
    print(f"SCORE    : {score:.4f}")
    print()




ONLINE   : Find a shelter or rescue group near zip code 90011.
MIND2WEB : Find a shelter or rescue group near zip code 90011.
SCORE    : 1.0000

ONLINE   : Calculate a FedEx Ground shipping rate for a 3-pound package from zip code 10019 to zip code 90028.
MIND2WEB : Calculate a Fedex Ground shipping rate for a 3 pound package from zip code 10019 to zip code 90028.
SCORE    : 1.0000

ONLINE   : Estimate the cost of a photographer in 07055 for a 4-hour project.
MIND2WEB : Estimate the cost of a photographer in 07055 for a 4 hour project
SCORE    : 1.0000

ONLINE   : Find out how many assists Chris Paul has been averaging in the current season.
MIND2WEB : Find out how many assists Chris Paul has been averaging in the current season.
SCORE    : 1.0000

ONLINE   : Find a permanent job in Logistics within 20 miles of New York, zip 11005, in the middle-income range for a high school diploma holder.
MIND2WEB : Find a permanent job in Logistics within 20 miles of New York, zip 11005, in the m

In [12]:
print("\n❌ NOT MATCHED LIST (score < threshold):")
# 점수 낮은 것부터 보기 좋게 정렬
missing_sorted = sorted(missing, key=lambda x: x[2])

for ot, mt, sc in missing_sorted:
    print("\n---")
    print(f"online : {ot}")
    print(f"best   : {mt}")
    print(f"score  : {sc:.4f}")


❌ NOT MATCHED LIST (score < threshold):

---
online : Complete a multiplication quiz on https://www.coolmath4kids.com/, covering multiplication facts for 11-12. The quiz should consist of 10 questions, with unlimited time allowed for each. The goal is to achieve a perfect score of 10 out of 10.
best   : Locate a large store in Washington that has kids' and maternity products, also check if they have a parking lot, and see the directions of the nearest store.
score  : 0.3523

---
online : My baby boy was born on Dec 1, 2025. His weight is 7 lb 8 oz, height is 20 inches, and head circumference is 35 cm. The measurement date is Dec 1, 2025. Can you calculate his growth percentiles using a calculator?
best   : Register to watch the webcast on developing Career ready student. My name is John Doe. My email is johndoe@gmail. I worked at UAI with a job title of Accounting
score  : 0.3591

---
online : I want to purchase an open-box Samsung Galaxy S25 Plus in excellent condition and trade in a