# Kaggle Test: Product Similarity (Multi-agent + Judge)

Notebook này chạy pipeline mới:
- Analyzer (prompt + few-shot) giữ nguyên từ `product_similarity`.
- Multi-agent theo tiêu chí (Nature, Intended Purpose, Channel of trade, ...).
- Judge gộp điểm các tiêu chí để ra Overall Similarity.

Notebook tự động phát hiện repo/data từ `/kaggle/input` và có thể chạy theo các chế độ sau:
- Không model (chỉ build prompt) hoặc dùng HF local/Chat API.
- Chạy đơn lẻ một case hoặc batch từ CSV (`data/75_samples.csv` hoặc `data/100_samples.csv`).


In [None]:
# Paths: adapt names to your uploaded Kaggle Datasets
import os, sys, json, shutil
from pathlib import Path

# Workspace setup: find the repo dataset under /kaggle/input, copy it to a
# writable working directory, and (when available) copy the data dataset
# into <working repo>/data.

# CHANGE THESE if your dataset names are different
REPO_DS_NAME = "product-similarity-scorer"   # dataset containing the repo (code files)
DATA_DS_NAME = "products-similarity-scorer-data"   # dataset containing data/ and/or data_nice_cls/

# Kaggle input base
KAGGLE_INPUT = Path("/kaggle/input")

# Locate the repo mount: the first directory whose name starts with
# REPO_DS_NAME and which contains product_similarity/prompt.py.
# sorted() keeps the pick stable if several mounts match the prefix.
repo_root = next(
    (
        candidate
        for candidate in sorted(KAGGLE_INPUT.glob(f"{REPO_DS_NAME}*"))
        if (candidate / "product_similarity" / "prompt.py").exists()
    ),
    None,
)

if repo_root is None:
    raise RuntimeError("Không tìm thấy repo dataset. Đảm bảo bạn đã thêm Input Dataset cho repo.")

# Add repo to sys.path
sys.path.insert(0, str(repo_root))

# Prepare working directory in /kaggle/working
work_dir = Path("/kaggle/working/product_similarity_work")
work_dir.mkdir(parents=True, exist_ok=True)

# Copy repo files into working dir so we can run/modify locally if needed
code_root = work_dir / "repo"
shutil.copytree(repo_root, code_root, dirs_exist_ok=True)
print("Repo root:", code_root)

# Locate data dataset (optional if you only want to run no-model without data).
# A Path object is never None, so the dataset's presence must be checked on
# disk; otherwise copytree below fails when the dataset was not attached.
data_root = KAGGLE_INPUT / DATA_DS_NAME
if not data_root.is_dir():
    print("Data dataset not found, continuing without data:", data_root)
    data_root = None

print("Data root:", data_root)

# The data directory is always created so later cells can build paths under it.
target_data = code_root / "data"
target_data.mkdir(parents=True, exist_ok=True)

if data_root is not None:
    # NOTE: the whole dataset tree is copied under data/; no separate
    # data_nice_cls directory is created next to it.
    shutil.copytree(data_root, target_data, dirs_exist_ok=True)

print("Prepared data at:", target_data)



In [None]:
# Ensure the working copy of the product_similarity package is importable,
# then import it and report its version.
import importlib
import sys

pkg_path = code_root / "product_similarity"
# Explicit raise instead of assert: asserts are removed when Python runs with
# -O, and the other setup cells already report problems via RuntimeError.
if not (pkg_path / "__init__.py").exists():
    raise RuntimeError("Missing package files in repo dataset!")

# Put working repo to sys.path first. Any existing entry is dropped before
# inserting so that re-running this cell does not accumulate duplicates.
_code_root_entry = str(code_root)
sys.path[:] = [entry for entry in sys.path if entry != _code_root_entry]
sys.path.insert(0, _code_root_entry)

product_similarity = importlib.import_module("product_similarity")
print("Loaded product_similarity version:", getattr(product_similarity, "__version__", "unknown"))



In [None]:
# Build nice_chunks.json if missing, by executing tools/merge_nice_cls.py
# from the working copy of the repo.
from pathlib import Path

nice_path = code_root / "data" / "nice_chunks.json"
if nice_path.exists():
    print("Found:", nice_path)
else:
    print("nice_chunks.json missing -> attempt to build from data_nice_cls")
    tools_script = code_root / "tools" / "merge_nice_cls.py"
    if not tools_script.exists():
        raise RuntimeError("merge_nice_cls.py not found in repo/tools")
    import runpy
    import sys

    # run_path() sets __name__ to "<run_path>" unless run_name is given, so a
    # script guarded by `if __name__ == "__main__":` would silently do nothing.
    # The script also gets a clean argv so that arguments belonging to the
    # hosting process are not mistaken for its own options.
    _saved_argv = sys.argv[:]
    sys.argv = [str(tools_script)]
    try:
        runpy.run_path(str(tools_script), run_name="__main__")
    finally:
        sys.argv = _saved_argv

    # The script's output location is not visible here, so only warn (do not
    # fail) when the expected file is still absent after the run.
    if nice_path.exists():
        print("Built:", nice_path)
    else:
        print("WARNING: merge_nice_cls.py ran but did not create:", nice_path)



In [None]:
# Optional: provide NV_API_KEY manually when it is not already present in the
# environment (e.g. when Kaggle Secrets are not used).
import os

if os.environ.get("NV_API_KEY") is None:
    # os.environ["NV_API_KEY"] = "YOUR_KEY_HERE"  # uncomment to set manually
    pass

In [None]:
# Run configuration consumed by the evaluation cells below.

# --- Local model settings ---------------------------------------------------
# Analyzer model id, e.g. "google/flan-t5-base"; None skips analyzer generation.
USE_ANALYZER_MODEL = None
# Default model id for the multi-agent stage.
AGENT_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
# Device index: -1 CPU, 0 GPU.
DEVICE = -1
MAX_NEW_TOKENS = 256

# --- OpenAI-compatible Chat API (e.g., NVIDIA) ------------------------------
# When enabled, the values below are passed to the evaluation call and apply
# to both analyzer and agents.
USE_CHAT_API = False
CHAT_API_BASE_URL = "https://integrate.api.nvidia.com/v1"
CHAT_API_MODEL = "meta/llama-3.1-8b-instruct"
# Read from the environment; set the Kaggle secret NV_API_KEY in notebook
# Settings -> Add-ons -> Secrets. The value is None when the variable is unset.
CHAT_API_KEY = os.getenv("NV_API_KEY")

# --- Optional known NICE class numbers for p1/p2 ----------------------------
CLASS_1 = None  # e.g., "3" or None
CLASS_2 = None  # e.g., "16" or None

from product_similarity.pipeline import run_similarity
from eval import evaluate_dataset



In [None]:
# Single-pair smoke run: push one product pair through evaluate_dataset
# (Analyzer + Agents + Judge) and print the metrics and predicted score.
import pandas as _pd

p1 = "chemical products used in the manufacture of plastics and in the photocopying industry"
p2 = "chemical additives for detergents"

# evaluate_dataset is given a CSV path, so the pair is written to a one-row
# CSV file in the working directory.
_tmp_csv = "/kaggle/working/_single_case.csv"
_single_row = {"Item 1": p1, "Item 2": p2, "Level of similarity": 4}
_pd.DataFrame([_single_row]).to_csv(_tmp_csv, index=False)

# Chat API settings are forwarded only when the Chat API toggle is on.
if USE_CHAT_API:
    _chat_base_url = CHAT_API_BASE_URL
    _chat_key = CHAT_API_KEY
    _chat_model = CHAT_API_MODEL
else:
    _chat_base_url = _chat_key = _chat_model = None

out = evaluate_dataset(
    _tmp_csv,
    model_name=USE_ANALYZER_MODEL or None,
    agent_model=AGENT_MODEL_ID,
    chat_api_base_url=_chat_base_url,
    chat_api_key=_chat_key,
    chat_api_model=_chat_model,
    device=DEVICE,
    max_new_tokens=MAX_NEW_TOKENS,
)

print("Metrics:", out["metrics"])
_first_result = out["results"][0]
print("Pred overall:", _first_result["pred_overall"])


In [None]:
# print("\nPrompt preview:\n", res["prompt"])

In [None]:
# Batch evaluation from CSV (optional) using multi-agent judge
import pandas as pd
import numpy as np
from pathlib import Path
import json as _json

# Pick the evaluation CSV: 75_samples.csv when present, otherwise
# 100_samples.csv, then load it and record whether data is available.
csv_75 = code_root / "data" / "75_samples.csv"
csv_100 = code_root / "data" / "100_samples.csv"
if csv_75.exists():
    csv_path = csv_75
else:
    csv_path = csv_100

if csv_path.exists():
    df = pd.read_csv(csv_path)
    # First column whose name is "level of similarity", ignoring case and
    # surrounding whitespace; None when there is no such column.
    for _col in df.columns:
        if _col.strip().lower() == "level of similarity":
            label_col = _col
            break
    else:
        label_col = None
    print("Loaded", len(df), "rows from:", csv_path)
    data_prepared = True
else:
    print("CSV not found:", csv_path)
    data_prepared = False

In [None]:
if data_prepared:
    # Chat API settings are forwarded only when the Chat API toggle is on.
    if USE_CHAT_API:
        _chat_base_url = CHAT_API_BASE_URL
        _chat_key = CHAT_API_KEY
        _chat_model = CHAT_API_MODEL
    else:
        _chat_base_url = _chat_key = _chat_model = None

    # Run evaluate_dataset over the whole CSV.
    out = evaluate_dataset(
        str(csv_path),
        model_name=USE_ANALYZER_MODEL or None,
        agent_model=AGENT_MODEL_ID,
        chat_api_base_url=_chat_base_url,
        chat_api_key=_chat_key,
        chat_api_model=_chat_model,
        device=DEVICE,
        max_new_tokens=MAX_NEW_TOKENS,
    )

    # One flat record per result; "label" is None when gold_overall is absent.
    out_rows = [
        {
            "p1": item["product_1"],
            "p2": item["product_2"],
            "pred": item["pred_overall"],
            "label": item.get("gold_overall"),
        }
        for item in out["results"]
    ]
    out_df = pd.DataFrame(out_rows)

    metrics = out["metrics"]

    # Persist per-row predictions as CSV and the metrics as UTF-8 JSON.
    out_path = Path("/kaggle/working/batch_results.csv")
    out_df.to_csv(out_path, index=False)

    metrics_path = Path("/kaggle/working/metrics.json")
    _metrics_json = _json.dumps(metrics, ensure_ascii=False, indent=2)
    metrics_path.write_text(_metrics_json, encoding="utf-8")

    print("Saved:", out_path)
    print("Metrics:", metrics)