# Kaggle Test: Product Similarity (NICE)

Notebook này dùng để test repo Product_Similarity trên Kaggle.
- Tự phát hiện repo và dữ liệu từ `/kaggle/input`.
- Hỗ trợ 2 chế độ: không dùng model (chỉ prompt + retriever) và dùng model HF cục bộ.
- Có thể build lại `nice_chunks.json` từ `data_nice_cls` nếu thiếu.
- Hỗ trợ chạy đơn lẻ và chạy batch từ file CSV.



In [None]:
# Paths: adapt names to your uploaded Kaggle Datasets
import os, sys, json, shutil
from pathlib import Path

# CHANGE THESE if your dataset names are different
REPO_DS_NAME = "product-similarity-repo"   # dataset containing the repo (code files)
DATA_DS_NAME = "product-similarity-data"   # dataset containing data/ and/or data_nice_cls/

# Kaggle input base: the directory searched for the attached datasets
KAGGLE_INPUT = Path("/kaggle/input")


def _find_dataset(base, prefix, markers):
    """Return the first entry under *base* named ``prefix*`` that holds a marker.

    Candidates are visited in sorted order so the result is reproducible when
    several entries share the prefix (the iteration order of ``Path.glob`` is
    not something to rely on).

    Args:
        base: Directory to search in.
        prefix: Name prefix of the dataset directory.
        markers: Paths relative to a candidate; one existing marker is enough.

    Returns:
        The matching ``Path``, or ``None`` when no candidate qualifies.
    """
    for candidate in sorted(base.glob(f"{prefix}*")):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return None


# Locate the repo dataset: it must ship product_similarity/prompt.py
repo_root = _find_dataset(
    KAGGLE_INPUT,
    REPO_DS_NAME,
    [Path("product_similarity") / "prompt.py"],
)
if repo_root is None:
    raise RuntimeError("Không tìm thấy repo dataset. Đảm bảo bạn đã thêm Input Dataset cho repo.")

# Add repo to sys.path
sys.path.insert(0, str(repo_root))

# Prepare working directory in /kaggle/working
work_dir = Path("/kaggle/working/product_similarity_work")
work_dir.mkdir(parents=True, exist_ok=True)

# Copy repo files into working dir so we can run/modify locally if needed
code_root = work_dir / "repo"
shutil.copytree(repo_root, code_root, dirs_exist_ok=True)
print("Repo root:", code_root)

# Locate data dataset (optional if you only want to run no-model without data);
# accepted if it contains data/ or data_nice_cls/
data_root = _find_dataset(KAGGLE_INPUT, DATA_DS_NAME, ["data", "data_nice_cls"])
print("Data root:", data_root)

# Make sure both data folders exist in the working repo, even when no data
# dataset is attached, so later cells can build paths under them.
target_data = code_root / "data"
target_cls = code_root / "data_nice_cls"
for target in (target_data, target_cls):
    target.mkdir(parents=True, exist_ok=True)

# Copy whichever of the two folders the data dataset actually provides
if data_root is not None:
    for source, target in (
        (data_root / "data", target_data),
        (data_root / "data_nice_cls", target_cls),
    ):
        if source.exists():
            shutil.copytree(source, target, dirs_exist_ok=True)

print("Prepared data at:", target_data, target_cls)



In [None]:
# Ensure importable package
import importlib
pkg_path = code_root / "product_similarity"
# Explicit raise instead of `assert`: assertions are stripped under `python -O`,
# and the other setup cells report environment problems with RuntimeError.
if not (pkg_path / "__init__.py").exists():
    raise RuntimeError("Missing package files in repo dataset!")

# Put working repo to sys.path first
import sys
code_root_entry = str(code_root)
# Remove earlier copies of the entry so re-running this cell neither stacks
# duplicates nor leaves the working copy behind another path entry.
while code_root_entry in sys.path:
    sys.path.remove(code_root_entry)
sys.path.insert(0, code_root_entry)

product_similarity = importlib.import_module("product_similarity")
print("Loaded product_similarity version:", getattr(product_similarity, "__version__", "unknown"))



In [None]:
# Build nice_chunks.json if missing
from pathlib import Path

nice_path = code_root / "data" / "nice_chunks.json"
if nice_path.exists():
    print("Found:", nice_path)
else:
    print("nice_chunks.json missing -> attempt to build from data_nice_cls")
    tools_script = code_root / "tools" / "merge_nice_cls.py"
    if not tools_script.exists():
        raise RuntimeError("merge_nice_cls.py not found in repo/tools")
    import runpy
    # run_path() defaults run_name to "<run_path>", which would skip any
    # `if __name__ == "__main__":` guard in the script; run it as a real script.
    # NOTE(review): if the script resolves its input/output paths relative to the
    # current directory, it may also need to run from code_root - confirm.
    runpy.run_path(str(tools_script), run_name="__main__")
    # Surface a silent no-op instead of failing later with a missing file.
    if not nice_path.exists():
        print("WARNING: merge script finished but file is still missing:", nice_path)



In [None]:
# Run configuration shared by the single-run and batch cells below.
# Toggle: use model or not. When False, those cells pass model_name=None
# to run_similarity instead of MODEL_ID.
USE_MODEL = False  # set True to use HF model id below
MODEL_ID = "google/flan-t5-base"  # change model as needed
DEVICE = -1  # -1 CPU, 0 GPU

# Optional: provide known NICE class numbers for p1/p2.
# The batch cell uses these as defaults when the CSV has no class1/class2 column.
CLASS_1 = None  # e.g., "3"
CLASS_2 = None  # e.g., "16"

from product_similarity.pipeline import run_similarity



In [None]:
# Single-run example: score one pair of product descriptions
p1 = "Make-up preparations"
p2 = "Tissues of paper for removing make-up"

# The model is only named when USE_MODEL is switched on; otherwise None is passed.
res = run_similarity(
    p1,
    p2,
    class_1=CLASS_1,
    class_2=CLASS_2,
    model_name=(None if not USE_MODEL else MODEL_ID),
    device=DEVICE,
)

# Compact summary: the scores, then the first line of each retrieved context
print("Scores:", res["scores"])  # may be None values if USE_MODEL=False
print("Contexts:")
for context in res["contexts"]:
    first_line, _, _ = context.partition("\n")
    print("-", first_line)

# Preview of the prompt, cut at 600 characters
prompt_preview = res["prompt"][:600]
print("\nPrompt preview:\n", prompt_preview, "...")



In [None]:
# Batch evaluation from CSV (optional)
# Expect CSV with headers: p1,p2[,class1,class2]
import pandas as pd
from pathlib import Path

# Change this path if your data CSV is elsewhere inside the data dataset
csv_path = code_root / "data" / "100_samples.csv"


def _normalize_class(value, default):
    """Turn a CSV class cell into the string form used for CLASS_1/CLASS_2.

    Args:
        value: Raw cell value from the row (may be NaN, a number, or text).
        default: Value returned when the cell is missing/NaN.

    Returns:
        *default* for missing cells; otherwise the class as a stripped string,
        with whole-number floats such as ``3.0`` rendered as ``"3"``.
    """
    if value is None or pd.isna(value):
        return default
    # pandas reads an integer column that has gaps as float64 (3 -> 3.0).
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


if not csv_path.exists():
    print("CSV not found:", csv_path)
else:
    df = pd.read_csv(csv_path)
    # The first two columns are taken as the product pair; fail with a clear
    # message rather than an IndexError on the column lookups below.
    if df.shape[1] < 2:
        raise ValueError(f"Expected at least 2 columns (p1, p2) in {csv_path}, found {df.shape[1]}")

    # Try to normalize columns
    rename_map = {}
    if df.columns[0] not in ("p1", "P1"):
        rename_map[df.columns[0]] = "p1"
    if df.columns[1] not in ("p2", "P2"):
        rename_map[df.columns[1]] = "p2"
    df = df.rename(columns=rename_map)
    print("Loaded", len(df), "rows")

    # Loop invariant: the same model setting applies to every row.
    model_name = MODEL_ID if USE_MODEL else None

    out_rows = []
    for _, row in df.iterrows():
        # Positional fallback covers headers spelled "P1"/"P2", which are not renamed.
        p1 = str(row.get("p1", row.iloc[0]))
        p2 = str(row.get("p2", row.iloc[1]))
        # Missing column or empty cell -> notebook-level default class.
        c1 = _normalize_class(row.get("class1", CLASS_1), CLASS_1)
        c2 = _normalize_class(row.get("class2", CLASS_2), CLASS_2)
        res = run_similarity(
            p1,
            p2,
            class_1=c1,
            class_2=c2,
            model_name=model_name,
            device=DEVICE,
        )
        # Tolerate a None scores value so one row cannot abort the whole batch.
        scores = res["scores"] or {}
        out_rows.append({
            "p1": p1,
            "p2": p2,
            "class1": c1,
            "class2": c2,
            "nature": scores.get("nature"),
            "purpose": scores.get("purpose"),
            "overall": scores.get("overall"),
        })
    out_df = pd.DataFrame(out_rows)
    out_path = Path("/kaggle/working/batch_results.csv")
    out_df.to_csv(out_path, index=False)
    print("Saved:", out_path)

