In [None]:
import os, sys, json, subprocess
from pathlib import Path

# Fixed location of the repository when it is attached as a Kaggle input;
# fall back to the current directory when that path does not exist.
repo_dir = Path('/kaggle/input/nature-score-predict')
if not repo_dir.exists():
    repo_dir = Path('.')
print(f'Repo dir: {repo_dir}')


def _pip_install(*pip_args):
    """Best-effort ``pip install -q`` into the running kernel's environment.

    pip is launched as ``sys.executable -m pip`` so packages land in the same
    interpreter that executes this notebook, rather than in whichever ``pip``
    executable happens to be first on PATH.

    Args:
        *pip_args: Arguments appended after ``pip install -q``.

    Returns:
        True when pip exited with status 0, False otherwise. A failure is
        reported on stderr but never raised, so the notebook keeps running
        (e.g. when the packages are already present and there is no network).
    """
    completed = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', *pip_args],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"WARNING: 'pip install {' '.join(pip_args)}' exited with code "
            f"{completed.returncode}; continuing with the packages already installed.",
            file=sys.stderr,
        )
    return completed.returncode == 0


# Install the repo's dependencies (only when a requirements.txt is present).
req_path = repo_dir / 'requirements.txt'
if req_path.exists():
    _pip_install('-r', str(req_path))

# The Chat API needs the openai library.
_pip_install('openai')

# Put the repo on the import path, then import evaluate_dataset from it.
sys.path.insert(0, str(repo_dir.resolve()))
from eval import evaluate_dataset


In [None]:
# Single-sample test configuration
USE_ANALYZER_MODEL = None  # 'google/flan-t5-base', or None to skip the analyzer
AGENT_MODEL_ID    = 'mistralai/Mistral-7B-Instruct-v0.2'
DEVICE            = -1  # NOTE(review): presumably -1 selects CPU — confirm against evaluate_dataset
MAX_NEW_TOKENS    = 256

# NVIDIA Chat API settings
USE_CHAT_API      = True
CHAT_API_BASE_URL = 'https://integrate.api.nvidia.com/v1'
CHAT_API_MODEL    = 'nvidia/nemotron-nano-12b-v2-vl'

# SECURITY: the API key must never be stored in the notebook. It is read from
# the NVIDIA_API_KEY environment variable (an empty value counts as missing).
# Export the variable before running this cell, e.g. from a notebook secret.
# Any key that was previously hardcoded here is compromised and should be
# revoked and rotated.
CHAT_API_KEY      = os.environ.get('NVIDIA_API_KEY') or None
if USE_CHAT_API and CHAT_API_KEY is None:
    print(
        'WARNING: USE_CHAT_API is enabled but NVIDIA_API_KEY is not set; '
        'Chat API requests will be sent without a key.',
        file=sys.stderr,
    )

# Tạo CSV tạm 1 dòng
from pathlib import Path
import csv
from product_similarity.pipeline import run_similarity

# Product pair used for the single-sample run.
p1_text = 'Make-up preparations'
p2_text = 'Tissues of paper for removing make-up'

# Write a temporary CSV holding the header row plus this one pair.
one_csv = Path('/kaggle/working/one_sample.csv')
one_csv.parent.mkdir(parents=True, exist_ok=True)
with one_csv.open('w', newline='', encoding='utf-8') as sample_file:
    csv.writer(sample_file).writerows([
        ['Item 1', 'Item 2', 'Level of similarity'],
        [p1_text, p2_text, '-'],  # '-' marks that this pair has no label
    ])

# The three Chat API arguments carry the configured values only when the
# Chat API is enabled; otherwise each of them is passed as None.
if USE_CHAT_API:
    single_chat_kwargs = {
        'chat_api_base_url': CHAT_API_BASE_URL,
        'chat_api_key': CHAT_API_KEY,
        'chat_api_model': CHAT_API_MODEL,
    }
else:
    single_chat_kwargs = dict.fromkeys(
        ('chat_api_base_url', 'chat_api_key', 'chat_api_model')
    )

# Evaluate the one-row CSV.
res = evaluate_dataset(
    str(one_csv),
    model_name=USE_ANALYZER_MODEL,
    agent_model=AGENT_MODEL_ID,
    **single_chat_kwargs,
    device=DEVICE,
    max_new_tokens=MAX_NEW_TOKENS,
    include_spsc=True,
    spsc_top_k=2,
)

# Show the metrics, then the predicted and gold values of the only result.
single_metrics_text = json.dumps(res['metrics'], ensure_ascii=False, indent=2)
print(single_metrics_text)
first = res['results'][0]
print({field: first.get(field) for field in ('pred_overall', 'gold_overall')})

# Call the pipeline directly on the same pair so the full prompt and the
# full output text can be printed.
if USE_CHAT_API:
    direct_chat_kwargs = {
        'chat_api_base_url': CHAT_API_BASE_URL,
        'chat_api_key': CHAT_API_KEY,
        'chat_api_model': CHAT_API_MODEL,
    }
else:
    direct_chat_kwargs = dict.fromkeys(
        ('chat_api_base_url', 'chat_api_key', 'chat_api_model')
    )

single_run = run_similarity(
    p1_text,
    p2_text,
    max_fewshot=2,
    top_k=3,
    include_spsc=True,
    spsc_top_k=2,
    **direct_chat_kwargs,
    model_name=USE_ANALYZER_MODEL,
    device=DEVICE,
    max_new_tokens=MAX_NEW_TOKENS,
)

# Print each section banner followed by the matching field of the result
# (an empty string when the field is absent).
for banner, field_name in (
    ('\n=== PROMPT ===\n', 'prompt'),
    ('\n=== OUTPUT ===\n', 'output_text'),
):
    print(banner)
    print(single_run.get(field_name, ''))


In [None]:
# Test batch từ CSV (ưu tiên 75_samples.csv)
from pathlib import Path

# Locate the data CSV inside the repo; candidates are checked in order, so
# 75_samples.csv wins over 100_samples.csv when both exist.
candidate_csvs = (
    repo_dir / 'data' / '75_samples.csv',
    repo_dir / 'data' / '100_samples.csv',
)
csv_path = next((path for path in candidate_csvs if path.exists()), None)

# Stop here when neither candidate file is present.
if csv_path is None:
    raise FileNotFoundError('Không tìm thấy data CSV (75_samples.csv hoặc 100_samples.csv) trong repo inputs.')

print(f'Using CSV: {csv_path}')

# The three Chat API arguments carry the configured values only when the
# Chat API is enabled; otherwise each of them is passed as None.
if USE_CHAT_API:
    batch_chat_kwargs = {
        'chat_api_base_url': CHAT_API_BASE_URL,
        'chat_api_key': CHAT_API_KEY,
        'chat_api_model': CHAT_API_MODEL,
    }
else:
    batch_chat_kwargs = dict.fromkeys(
        ('chat_api_base_url', 'chat_api_key', 'chat_api_model')
    )

# Evaluate every row of the selected CSV.
out = evaluate_dataset(
    str(csv_path),
    model_name=USE_ANALYZER_MODEL,
    agent_model=AGENT_MODEL_ID,
    **batch_chat_kwargs,
    device=DEVICE,
    max_new_tokens=MAX_NEW_TOKENS,
    include_spsc=True,
    spsc_top_k=2,
)

# Show the batch metrics as indented JSON.
batch_metrics_text = json.dumps(out['metrics'], ensure_ascii=False, indent=2)
print(batch_metrics_text)

# Save a compact view of the results under /kaggle/working.
results_csv_path = '/kaggle/working/batch_results.csv'
metrics_json_path = '/kaggle/working/metrics.json'

# One flat row per result: the two products, the prediction and the label.
rows = [
    {
        'p1': record.get('product_1'),
        'p2': record.get('product_2'),
        'pred': record.get('pred_overall'),
        'label': record.get('gold_overall'),
    }
    for record in out['results']
]

import pandas as pd
pd.DataFrame(rows).to_csv(results_csv_path, index=False)
with open(metrics_json_path, 'w', encoding='utf-8') as metrics_file:
    json.dump(out['metrics'], metrics_file, ensure_ascii=False, indent=2)

print('Saved:', results_csv_path, metrics_json_path)
