# Kaggle Test: Product Similarity (NICE)

Notebook này dùng để test repo Product_Similarity trên Kaggle.
- Tự phát hiện repo và dữ liệu từ `/kaggle/input`.
- Hỗ trợ 2 chế độ: không dùng model (chỉ prompt + retriever) và dùng model HF cục bộ.
- Có thể build lại `nice_chunks.json` từ `data_nice_cls` nếu thiếu.
- Hỗ trợ chạy đơn lẻ và chạy batch từ file CSV.



In [None]:
# Paths: adapt names to your uploaded Kaggle Datasets
import os, sys, json, shutil
from pathlib import Path

# CHANGE THESE if your Kaggle dataset names are different
REPO_DS_NAME = "product-similarity-scorer"   # dataset containing the repo (code files)
DATA_DS_NAME = "products-similarity-scorer-data"   # dataset containing data/ and/or data_nice_cls/

# Directory under which the attached input datasets are looked up
KAGGLE_INPUT = Path("/kaggle/input")

# Locate the repo mount point: the first directory whose name starts with
# REPO_DS_NAME and that contains product_similarity/prompt.py.
repo_root = None
for p in KAGGLE_INPUT.glob(f"{REPO_DS_NAME}*"):
    if (p / "product_similarity" / "prompt.py").exists():
        repo_root = p
        break

if repo_root is None:
    raise RuntimeError("Không tìm thấy repo dataset. Đảm bảo bạn đã thêm Input Dataset cho repo.")

# Make the repo dataset importable
sys.path.insert(0, str(repo_root))

# Prepare working directory in /kaggle/working
work_dir = Path("/kaggle/working/product_similarity_work")
work_dir.mkdir(parents=True, exist_ok=True)

# Copy repo files into the working dir so we can run/modify them locally if needed
code_root = work_dir / "repo"
shutil.copytree(repo_root, code_root, dirs_exist_ok=True)
print("Repo root:", code_root)

# Locate the data dataset. It is optional (a no-model run may not need it), so a
# missing directory is reported and skipped instead of crashing in copytree.
data_root = KAGGLE_INPUT / DATA_DS_NAME
if not data_root.is_dir():
    print("Data dataset not found, skipping data copy:", data_root)
    data_root = None

print("Data root:", data_root)

# Copy the whole data dataset into <repo>/data. Any data_nice_cls/ folder inside
# the dataset therefore ends up at <repo>/data/data_nice_cls, not <repo>/data_nice_cls.
target_data = code_root / "data"
target_data.mkdir(parents=True, exist_ok=True)

if data_root is not None:
    shutil.copytree(data_root, target_data, dirs_exist_ok=True)

print("Prepared data at:", target_data)



In [None]:
# Ensure the working copy of the repo is an importable package
import importlib
pkg_path = code_root / "product_similarity"
if not (pkg_path / "__init__.py").exists():
    # Explicit raise rather than `assert`: assert statements are removed when
    # Python runs with -O, which would silently skip this validation.
    raise FileNotFoundError(
        f"Missing package files in repo dataset! Expected: {pkg_path / '__init__.py'}"
    )

# Put the working repo first on sys.path. Remove any earlier entry for the same
# path so re-running this cell does not pile up duplicates.
import sys
_code_root_str = str(code_root)
if _code_root_str in sys.path:
    sys.path.remove(_code_root_str)
sys.path.insert(0, _code_root_str)

product_similarity = importlib.import_module("product_similarity")
print("Loaded product_similarity version:", getattr(product_similarity, "__version__", "unknown"))



In [None]:
# Build nice_chunks.json if missing
from pathlib import Path

nice_path = code_root / "data" / "nice_chunks.json"
if nice_path.exists():
    print("Found:", nice_path)
else:
    print("nice_chunks.json missing -> attempt to build from data_nice_cls")
    tools_script = code_root / "tools" / "merge_nice_cls.py"
    if not tools_script.exists():
        raise RuntimeError("merge_nice_cls.py not found in repo/tools")
    import runpy
    import sys

    # Run the script the way `python merge_nice_cls.py` would:
    # - run_name="__main__" so an `if __name__ == "__main__":` guard fires
    #   (runpy's default run name is "<run_path>", which would skip it);
    # - a clean sys.argv so the host process's own command-line arguments
    #   are not visible to the script.
    _saved_argv = sys.argv
    sys.argv = [str(tools_script)]
    try:
        runpy.run_path(str(tools_script), run_name="__main__")
    finally:
        sys.argv = _saved_argv

    # The script's output location is not visible from this notebook, so verify
    # the expected file and report instead of assuming the build succeeded.
    if nice_path.exists():
        print("Built:", nice_path)
    else:
        print("WARNING: merge_nice_cls.py ran but nice_chunks.json was not created at:", nice_path)



In [None]:
# SECURITY: an NVIDIA API key used to be hard-coded on this line. A key committed
# to a notebook must be treated as leaked: revoke it and issue a new one.
# Provide the key out-of-band instead: add a Kaggle secret named NV_API_KEY
# (Settings -> Add-ons -> Secrets) and assign it to os.environ["NV_API_KEY"],
# e.g. with kaggle_secrets.UserSecretsClient().get_secret("NV_API_KEY").
if not os.environ.get("NV_API_KEY"):
    print("WARNING: NV_API_KEY is not set; Chat API calls will have no API key.")

In [None]:
# Run configuration

# --- Local Hugging Face model -------------------------------------------
# Flip USE_MODEL to True to run MODEL_ID locally.
USE_MODEL = False
MODEL_ID = "google/flan-t5-base"
# Device index: -1 selects the CPU, 0 selects the GPU
DEVICE = 0

# --- OpenAI-compatible Chat API (e.g., NVIDIA) used instead of HF --------
USE_CHAT_API = True
CHAT_API_MODEL = "nvidia/nvidia-nemotron-nano-9b-v2"
CHAT_API_BASE_URL = "https://integrate.api.nvidia.com/v1"
# Read from the NV_API_KEY environment variable; None when it is not set.
# (Kaggle: Settings -> Add-ons -> Secrets)
CHAT_API_KEY = os.getenv("NV_API_KEY")

# --- Optional known NICE class numbers for p1 / p2 -----------------------
# Given as strings such as "3" or "16"; use None when the class is unknown.
CLASS_1 = "1"
CLASS_2 = "1"

from product_similarity.pipeline import run_similarity



In [None]:
# # Single-run example
# p1 = "chemical products used in the manufacture of plastics and in the photocopying industry"
# p2 = "chemical additives for detergents"


# # HF model
# # res = run_similarity(
# #     p1,
# #     p2,
# #     class_1=CLASS_1,
# #     class_2=CLASS_2,
# #     model_name=(MODEL_ID if USE_MODEL else None),
# #     device=DEVICE,
# # )

# # NV Model
# res = run_similarity(
#     p1,
#     p2,
#     class_1=CLASS_1,
#     class_2=CLASS_2,
#     model_name=(MODEL_ID if (USE_MODEL and not USE_CHAT_API) else None),
#     chat_api_base_url=(CHAT_API_BASE_URL if USE_CHAT_API else None),
#     chat_api_key=(CHAT_API_KEY if USE_CHAT_API else None),
#     chat_api_model=(CHAT_API_MODEL if USE_CHAT_API else None),
#     device=DEVICE,
#     max_new_tokens=2048,
#     temperature=0.6,
#     top_p=0.95,
# )

# # Show compact summary
# print("Scores:", res["scores"])  # may be None values if USE_MODEL=False
# print("Contexts:")
# for c in res["contexts"]:
#     print("-", c.split("\n")[0])

# # Truncated prompt preview




In [None]:
# print("\nPrompt preview:\n", res["prompt"])

In [None]:
# Batch evaluation from CSV (optional)
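# Expected CSV columns: Item1, Item2 (falls back to the first two columns),
# optional class1/class2, and an optional "Level of similarity" label column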
import pandas as pd
import numpy as np
from pathlib import Path
import json as _json

# Prefer 75_samples if available, else fall back to 100_samples
csv_75 = code_root / "data" / "75_samples.csv"
csv_100 = code_root / "data" / "100_samples.csv"
csv_path = csv_75 if csv_75.exists() else csv_100

# Always define the state read by the batch-evaluation cell. Without these
# defaults a missing CSV leaves `data_prepared` undefined and the next cell
# fails with NameError instead of simply skipping the batch run.
data_prepared = False
df = None
label_col = None

if not csv_path.exists():
    print("CSV not found:", csv_path)
else:
    df = pd.read_csv(csv_path)
    # Ground-truth column is matched case-insensitively, ignoring surrounding spaces
    label_col = next((c for c in df.columns if c.strip().lower() == "level of similarity"), None)
    print("Loaded", len(df), "rows from:", csv_path)
    data_prepared = True

In [None]:
def _normalize_class(value, default):
    """Return a CSV class cell as a clean string, or *default* when it is missing.

    Empty cells come back from pandas as NaN, and an integer column containing
    blanks is read as floats (3 -> 3.0). Both are normalised here so the value
    has the same string form as the CLASS_1 / CLASS_2 defaults.
    NOTE(review): assumes run_similarity accepts class numbers as strings, as
    the notebook defaults suggest - confirm against the pipeline.
    """
    if value is None or pd.isna(value):
        return default
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        value = int(value)
    text = str(value).strip()
    return text or default


def _label_metrics(results):
    """Compute exact-match accuracy and MSE of the `pred` column against `label`.

    Returns a dict with keys accuracy_exact, mse and n_eval. Both metrics are
    None when there is no label column or no row has both a prediction and a
    label, since an accuracy over zero rows is undefined rather than 0.0.
    """
    if "label" not in results.columns:
        return {"accuracy_exact": None, "mse": None, "n_eval": 0}

    # Coerce labels the same way predictions are coerced, so a malformed cell
    # drops that row from the evaluation instead of raising in astype(int).
    label_num = pd.to_numeric(results["label"], errors="coerce")
    valid = results["pred"].notna() & label_num.notna()
    n_eval = int(valid.sum())
    if n_eval == 0:
        return {"accuracy_exact": None, "mse": None, "n_eval": 0}

    pred = results.loc[valid, "pred"].astype(int)
    label = label_num[valid].astype(int)
    return {
        "accuracy_exact": float((pred == label).mean()),
        "mse": float(np.mean((pred - label) ** 2)),
        "n_eval": n_eval,
    }


if data_prepared:
    out_rows = []
    for _, row in df.iterrows():
        # Product descriptions: named columns first, else the first two columns
        p1 = str(row.get("Item1", row.iloc[0]))
        p2 = str(row.get("Item2", row.iloc[1]))
        # Per-row classes override the notebook-level defaults when present
        c1 = _normalize_class(row.get("class1"), CLASS_1)
        c2 = _normalize_class(row.get("class2"), CLASS_2)

        res = run_similarity(
            p1,
            p2,
            class_1=c1,
            class_2=c2,
            model_name=(MODEL_ID if (USE_MODEL and not USE_CHAT_API) else None),
            chat_api_base_url=(CHAT_API_BASE_URL if USE_CHAT_API else None),
            chat_api_key=(CHAT_API_KEY if USE_CHAT_API else None),
            chat_api_model=(CHAT_API_MODEL if USE_CHAT_API else None),
            device=DEVICE,
        )
        # Tolerate a missing scores mapping; individual scores may also be None
        scores = res["scores"] or {}
        out = {
            "p1": p1,
            "p2": p2,
            "class1": c1,
            "class2": c2,
            "nature": scores.get("nature"),
            "purpose": scores.get("purpose"),
            "overall": scores.get("overall"),
        }

        if label_col and label_col in row:
            out["label"] = row[label_col]
        out_rows.append(out)

    out_df = pd.DataFrame(out_rows)

    # Explicit nullable-integer 'pred' column derived from 'overall'.
    # Unparseable values become <NA>; rounding first keeps a fractional score
    # from making the Int64 cast raise.
    out_df["pred"] = pd.to_numeric(out_df["overall"], errors="coerce").round().astype("Int64")

    # Metrics based on label (Level of similarity: integer 0..4)
    metrics = _label_metrics(out_df)
    accuracy_exact = metrics["accuracy_exact"]
    mse = metrics["mse"]
    n_eval = metrics["n_eval"]

    # Save outputs
    out_path = Path("/kaggle/working/batch_results.csv")
    out_df.to_csv(out_path, index=False)

    metrics_path = Path("/kaggle/working/metrics.json")
    metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8")

    print("Saved:", out_path)
    print("Metrics:", metrics)