In [3]:
pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [4]:
import json
from pathlib import Path

import pandas as pd
import torch
from rapidfuzz import fuzz
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
MODEL_ID = "Qwen/Qwen2.5-3B-Instruct"
DATA_PATH = Path("/content/pov_to_vv_with_ids.csv")
OUTPUT_CSV = Path("/content/llm_judgements.csv")

In [25]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

SYSTEM_PROMPT = (
    "You are a neutral judge deciding whether two catalog names refer to the same underlying product."
    "Treat entries as MATCH when one is a generic category and the other is a specific brand/flavor/"
    "package that still belongs to that category. Only label MISMATCH when the core product type differs"
    "(e.g., meat vs fruit, fish vs spice). Use UNCERTAIN only if information is insufficient."
    "Ignore differences in brand, flavor, packaging, format, or granularity when they describe the same"
    "ingredient or product family (e.g., 'Snacks' vs 'Corn snacks Cheetos ketchup' = MATCH)."
    "Respond strictly with JSON: {\"match_label\": \"match|mismatch|uncertain\", "
    "\"confidence\": float 0..1, \"reason\": \"≤50 words\"}."
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
model.device

device(type='cuda', index=0)

In [22]:
USER_TEMPLATE = """Catalog A (Povarenok): "{pov_name}"

Catalog B (VkusVill): "{vv_name}"

Question: Do these refer to the same underlying product type? Focus on semantics (ingredient/category)
and ignore brand or packaging details when the core product stays the same. Respond with JSON only."""

def judge_pair(pov_name: str, vv_name: str) -> dict:
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_TEMPLATE.format(pov_name=pov_name, vv_name=vv_name)},
    ]
    model_inputs = tokenizer.apply_chat_template(
        messages, return_tensors="pt", padding=True
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            model_inputs,
            max_new_tokens=96,
            temperature=0.2,
            top_p=0.9,
            do_sample=False,
        )

    decoded = tokenizer.decode(
        output[0][model_inputs.shape[-1]:], skip_special_tokens=True
        ).strip()

    json_start = decoded.find("{")
    if json_start != -1:
        decoded = decoded[json_start:]

    try:
        parsed = json.loads(decoded)
    except json.JSONDecodeError:
        parsed = {
            "match_label": "uncertain",
            "confidence": 0.0,
            "reason": f"Parse error: {decoded}",
        }
    return parsed

In [31]:
df = pd.read_csv(DATA_PATH)

judgements = []
for row in tqdm(df.itertuples(index=False), total=len(df), desc="LLM judging (sample)"):
    pov = row.name_product_povarenok
    vv = row.name_product_vkusvill
    verdict = judge_pair(pov, vv)
    verdict["id_povarenok"] = row.id_povarenok
    verdict["id_vkusvill"] = row.id_vkusvill
    verdict["name_product_povarenok"] = pov
    verdict["name_product_vkusvill"] = vv
    verdict["fuzz_ratio"] = fuzz.token_set_ratio(pov, vv)
    judgements.append(verdict)

judgement_df = pd.DataFrame(judgements)
judgement_df.to_csv(OUTPUT_CSV, index=False)
display(judgement_df.head())

LLM judging (sample): 100%|██████████| 974/974 [47:22<00:00,  2.92s/it]


Unnamed: 0,match_label,confidence,reason,id_povarenok,id_vkusvill,name_product_povarenok,name_product_vkusvill,fuzz_ratio
0,match,0.95,Both products are categorized under 'ovoshchna...,636.0,10274.0,Икра овощная,"Закуска овощная ""Аджапсандал"" низкокалорийная",73.684211
1,match,0.9,"Both products are citrus fruits, regardless of...",537.0,52.0,Лимонник,Апельсины Навелин,24.0
2,match,1.0,"Both products are Bulgarian red pepper, ignori...",248.0,6313.0,Перец болгарский красный,"Перец болгарский красный ""ВВ Отборное""",100.0
3,match,0.9,"Both refer to meat, which is the same core pro...",146.0,5123.0,Мясо,Мясо криля натуральное,100.0
4,uncertain,0.5,Information about brand and specific flavor do...,511.0,223.0,Пахта,Айва,22.222222


In [32]:
metrics = {}
total = len(judgement_df)

for label in ["match", "mismatch", "uncertain"]:
    mask = judgement_df["match_label"] == label
    metrics[f"rate_{label}"] = mask.mean()
    metrics[f"avg_confidence_{label}"] = (
        judgement_df.loc[mask, "confidence"].mean() if mask.any() else float("nan")
    )

metrics["avg_confidence_all"] = judgement_df["confidence"].mean()

print("LLM-based evaluation metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.3f}")

LLM-based evaluation metrics:
rate_match: 0.802
avg_confidence_match: 0.950
rate_mismatch: 0.066
avg_confidence_mismatch: 0.908
rate_uncertain: 0.132
avg_confidence_uncertain: 0.426
avg_confidence_all: 0.878
