In [11]:
import json

# Peek at the website -> sub_domain mapping: how many distinct labels it uses
# and what they are.
# NOTE(review): despite its name, `online_data` holds the mapping here, while
# the attach-sub_domain cell uses the same name for the Online-Mind2Web
# records. Re-running cells out of order will swap what the name refers to.
with open("website_to_subdomain.json", mode="r", encoding="utf-8") as f:
    online_data = json.load(f)

# A set collapses the mapping's values down to the distinct labels.
unique_values = {*online_data.values()}
print("unique 개수:", len(unique_values))
print("unique values:", unique_values)

unique 개수: 30
unique values: {'Sports', 'Department', 'Ground', 'Government', 'Auto', 'Home service', 'Digital', 'Game', 'Education', 'Restaurant', 'Cooking', 'Car rental', 'Health', 'Hotel', 'General', 'Finance', 'Shipping', 'Pet', 'Music', 'Speciality', 'Fashion', 'Airlines', 'Housing', 'Event', 'Movie', 'Moving', 'Social media', 'Weather', 'Other', 'Job'}


In [6]:
import json
import re

# -----------------------
# Paths
# -----------------------
# Input records to annotate. The attach step below iterates the loaded value
# and reads a "website" field from each item.
ONLINE_PATH = "online_mind2web/Online_Mind2Web.json"
# JSON object whose keys are matched against the extracted website name and
# whose values are the sub_domain labels.
MAP_PATH = "website_to_subdomain.json"
# Destination for the records once a "sub_domain" field has been attached.
OUTPUT_PATH = "online_mind2web/Online_Mind2Web_with_subdomain.json"

# -----------------------
# 1. Load files
# -----------------------
# Both files are read fully into memory; `online_data` is later mutated in
# place (a "sub_domain" key is added to every item) before being saved.
with open(ONLINE_PATH, "r", encoding="utf-8") as f:
    online_data = json.load(f)

with open(MAP_PATH, "r", encoding="utf-8") as f:
    website_to_subdomain = json.load(f)

# -----------------------
# 2. website name extractor
# -----------------------
def extract_core_website(url: str) -> str | None:
    """
    Extract string between 'www.' and '.com'
    """
    if not url:
        return None

    match = re.search(r"www\.([^.]+)\.com", url)
    return match.group(1) if match else None


# -----------------------
# 3. Attach sub_domain
# -----------------------
# Raw "website" values for which no sub_domain could be attached.
unmatched = []

for item in online_data:
    raw_website = item.get("website", "")
    core_name = extract_core_website(raw_website)

    # The keys of website_to_subdomain.json are usually full URLs, so take the
    # label of the first key that contains core_name as a substring.
    # When no core name could be extracted the lookup is skipped entirely.
    matched_sd = None
    if core_name is not None:
        matched_sd = next(
            (sd for full_url, sd in website_to_subdomain.items() if core_name in full_url),
            None,
        )

    # Every item gets a "sub_domain" key; None marks an unmatched website.
    item["sub_domain"] = matched_sd
    if matched_sd is None:
        unmatched.append(raw_website)

# -----------------------
# 4. Save output
# -----------------------
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(online_data, f, ensure_ascii=False, indent=2)

print(f"Saved to {OUTPUT_PATH}")
print(f"Unmatched websites: {len(unmatched)}")


Saved to online_mind2web/Online_Mind2Web_with_subdomain.json
Unmatched websites: 147


In [12]:
import json
from openai import OpenAI

# Client used for every chat-completion request in this notebook.
# NOTE(review): constructed with no arguments — presumably credentials are
# picked up from the environment; confirm before running on a new machine.
client = OpenAI()
# Assign a sub_domain to every Online-Mind2Web record from scratch (LLM only,
# no lookup in the website -> sub_domain mapping).

# NOTE(review): absolute, machine-specific paths, unlike the relative paths
# used by the other cells. OUTPUT_PATH also rebinds the name the mapping-based
# cell uses for its own output file.
INPUT_PATH = "/mnt/raid5/parksh/Mind2web/online_mind2web/Online_Mind2Web.json"
OUTPUT_PATH = "/mnt/raid5/parksh/Mind2web/online_mind2web/Online_Mind2Web_with_subdomain_llm_v2.json"

# Model name passed to the chat-completions call.
MODEL_NAME = "gpt-5-mini"

# Records to annotate; later cells mutate this list's items in place.
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# System prompt: asks for exactly one short label and suggests (but does not
# enforce) a preferred label list.
# NOTE(review): the list differs from the labels printed for
# website_to_subdomain.json — it uses 'Home Service' / 'Car Rental' /
# 'Social Media' where the mapping has 'Home service' / 'Car rental' /
# 'Social media', and it omits 'Ground', 'General' and 'Speciality'. Because
# the model may also invent labels, outputs need normalising before they are
# compared with the mapping's labels.
SYSTEM_PROMPT = '''You are an expert dataset annotator.

Given a website and a task description, infer the single most appropriate high-level subdomain category.

Return ONLY the subdomain name as a short noun phrase.
Do NOT include explanations, punctuation, or multiple labels.

Prefer assigning a label from the following list when applicable, but you are not strictly limited to it:
Sports, Department, Government, Auto, Home Service, Digital, Game, Education, Restaurant, Cooking,
Car Rental, Health, Hotel, Finance, Shipping, Pet, Music, Fashion, Airlines, Housing,
Event, Movie, Moving, Social Media, Weather, Job, Other.
'''

def build_user_prompt(website: str, task: str) -> str:
    """Render the user message for one record.

    The message has three sections — Website, Task, Question — separated by
    blank lines, and starts and ends with a newline.

    Args:
        website: Website value of the record, inserted verbatim.
        task: Task description of the record, inserted verbatim.

    Returns:
        The formatted prompt text.
    """
    sections = (
        "",
        "Website:",
        f"{website}",
        "",
        "Task:",
        f"{task}",
        "",
        "Question:",
        "Which subdomain does this website belong to?",
        "",
    )
    return "\n".join(sections)

def infer_subdomain_with_llm(website: str, task: str) -> str:
    """Ask the chat model for the sub_domain label of one record.

    Sends SYSTEM_PROMPT plus the prompt built by build_user_prompt() to
    MODEL_NAME through the module-level `client`.

    Args:
        website: Website value of the record.
        task: Task description of the record.

    Returns:
        The text of the first returned choice with surrounding whitespace
        stripped. The reply is not validated against the preferred label list.

    Note:
        Errors from the request are not handled here and propagate to the
        caller. `.strip()` also fails if the message content is not a string
        — NOTE(review): confirm whether the API can return empty content.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(website, task)},
    ]
    completion = client.chat.completions.create(model=MODEL_NAME, messages=messages)

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()


In [13]:
# Annotate every record with an LLM-inferred sub_domain, then persist the result.
for item in data:
    website = item.get("website", "")
    task = item.get("confirmed_task", "")

    try:
        inferred_sd = infer_subdomain_with_llm(website, task)
    except Exception as e:
        # Best-effort: one failed request must not abort the whole batch.
        # The row keeps sub_domain=None so it can be found and retried later.
        print("LLM error:", e)
        inferred_sd = None

    item["sub_domain"] = inferred_sd
    # Flag only rows that actually received a label. Previously the flag was
    # set to 1 unconditionally, so failed rows (sub_domain=None) were counted
    # as "filled by LLM" and the count below always equalled len(data).
    item["with_llm"] = 1 if inferred_sd else 0

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Saved to {OUTPUT_PATH}")

# Number of rows whose sub_domain was really produced by the LLM.
llm_count = sum(1 for x in data if x.get("with_llm") == 1)
print("LLM으로 채운 row 수:", llm_count)


Saved to /mnt/raid5/parksh/Mind2web/online_mind2web/Online_Mind2Web_with_subdomain_llm_v2.json
LLM으로 채운 row 수: 300


In [4]:
import json
from collections import defaultdict, Counter
from pathlib import Path

FILE_PATH = "online_mind2web/Online_Mind2Web_with_subdomain_llm_v2.json"

def load_json_or_jsonl(path: str):
    """Load a list of records from a JSON file or a JSON Lines file.

    The file is first parsed as one JSON document:
      * a top-level list is returned as is;
      * a top-level dict is searched for a list under "data", "items",
        "examples" or "records" (in that order); failing that, if every
        value of the dict is itself a dict, those values are returned.
    If the file is not a single valid JSON document it is parsed as JSON
    Lines: one JSON value per non-blank line.

    Args:
        path: Path of the file to read (UTF-8).

    Returns:
        A list of parsed records. An empty file yields an empty list.

    Raises:
        FileNotFoundError: `path` does not exist.
        ValueError: the JSON document is a dict without a recognisable list
            of records, has an unsupported top-level type, or a JSONL line
            is not valid JSON.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")

    # 1) Try to read the whole file as a single JSON document.
    try:
        with p.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # 2) Not a single document -> fall back to JSON Lines. This branch used
        # to be a bare `pass`, which made the function return None.
        records = []
        with p.open("r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines between records
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as exc:
                    raise ValueError(
                        f"{path}: line {line_no} is not valid JSON ({exc})"
                    ) from exc
        return records

    # Common case: list[dict]
    if isinstance(data, list):
        return data

    if isinstance(data, dict):
        # Prefer a list stored under one of the usual wrapper keys.
        for k in ["data", "items", "examples", "records"]:
            if k in data and isinstance(data[k], list):
                return data[k]
        # Otherwise treat an id -> record mapping as its list of values
        # (not recommended as a format, kept as a safety net).
        vals = list(data.values())
        if vals and all(isinstance(v, dict) for v in vals):
            return vals
        raise ValueError("Loaded JSON is a dict, but cannot find a list of records inside.")

    # Scalars (string, number, bool, null) cannot be interpreted as records;
    # fail loudly instead of silently returning None.
    raise ValueError(
        f"Unsupported top-level JSON type in {path}: {type(data).__name__}"
    )


def get_key(record: dict, candidates):
    """Return the value stored under the first candidate key present in `record`.

    Candidates are tried in order. A key that is present with a None value
    still counts as found, so None is returned without trying later
    candidates. None is also returned when no candidate is present.
    """
    return next((record.get(name) for name in candidates if name in record), None)

def normalize(v):
    """Normalise a field value for use as a grouping key.

    Strings are stripped of surrounding whitespace and become None when
    nothing is left. Every other value, including None, is returned unchanged.
    """
    if not isinstance(v, str):
        return v
    stripped = v.strip()
    return stripped or None

def main():
    """Print how the records in FILE_PATH are distributed over sub_domains.

    Three reports are written to stdout:
      1. records and unique websites per sub_domain, largest first;
      2. per-website record counts inside each sub_domain;
      3. how many records lacked a sub_domain or a website.

    Records missing a value are grouped under the placeholder keys
    "__NULL_SUBDOMAIN__" / "__NULL_WEBSITE__". Entries that are not dicts
    are skipped and not counted in any report.
    """

    def banner(*titles):
        # Section header: blank line, rule, title line(s), rule.
        print("\n==============================")
        for title in titles:
            print(title)
        print("==============================")

    records = load_json_or_jsonl(FILE_PATH)
    print(f"Loaded records: {len(records)}")

    # sub_domain -> Counter(website -> number of records)
    subdomain_to_websites = defaultdict(Counter)
    missing_sd = missing_web = 0

    for rec in records:
        if isinstance(rec, dict):
            sd = normalize(get_key(rec, ["sub_domain", "subdomain", "subDomain"]))
            web = normalize(get_key(rec, ["website", "site", "domain"]))

            if sd is None:
                missing_sd += 1
                sd = "__NULL_SUBDOMAIN__"
            if web is None:
                missing_web += 1
                web = "__NULL_WEBSITE__"

            subdomain_to_websites[sd][web] += 1

    # (sub_domain, total records), largest total first; ties keep the order
    # in which the sub_domains were first seen.
    ranked = sorted(
        ((sd, sum(per_site.values())) for sd, per_site in subdomain_to_websites.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # 1) Totals per sub_domain
    banner("Subdomain totals (desc)")
    for sd, total in ranked:
        n_websites = len(subdomain_to_websites[sd])
        print(f"- {sd}: {total} records  |  {n_websites} unique websites")

    # 2) Website counts inside each sub_domain
    banner("Website counts within each subdomain", "(showing websites in desc count order)")
    for sd, total in ranked:
        print(f"\n[{sd}] total={total}")
        for web, cnt in subdomain_to_websites[sd].most_common():
            print(f"  - {web}: {cnt}")

    # 3) Missing-value statistics
    banner("Missing key stats")
    print(f"Missing sub_domain: {missing_sd}")
    print(f"Missing website:    {missing_web}")


if __name__ == "__main__":
    main()


Loaded records: 289

Subdomain totals (desc)
- Health: 32 records  |  10 unique websites
- Government: 25 records  |  11 unique websites
- Digital: 24 records  |  12 unique websites
- Other: 20 records  |  19 unique websites
- Auto: 18 records  |  8 unique websites
- Pet: 18 records  |  3 unique websites
- Shopping: 17 records  |  13 unique websites
- Housing: 17 records  |  6 unique websites
- Job: 15 records  |  10 unique websites
- Shipping: 12 records  |  4 unique websites
- Education: 12 records  |  11 unique websites
- Sports: 11 records  |  7 unique websites
- Weather: 11 records  |  3 unique websites
- Finance: 11 records  |  7 unique websites
- Transportation: 10 records  |  8 unique websites
- Travel: 10 records  |  7 unique websites
- Game: 7 records  |  5 unique websites
- Cooking: 5 records  |  3 unique websites
- Movie: 4 records  |  3 unique websites
- Event: 4 records  |  3 unique websites
- Music: 3 records  |  3 unique websites
- Social Media: 3 records  |  3 unique w

In [None]:
# Planned sub_domain merges (source labels -> merged label):
#   Auto + Transportation -> Transportation
#   Digital + Shopping -> Shopping
#   Game + Movie + Event + Music + Social Media -> Entertainment