In [None]:
from dataclasses import dataclass
import re
import numpy as np
import pandas as pd
import faiss
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from typing import List, Dict, Iterable, Sequence, Tuple, Optional, Set
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from rapidfuzz import fuzz
import json
from pathlib import Path

In [None]:
pov_df = pd.read_csv("recipes_normalized-4.csv", encoding="utf-8")
pov_df

In [None]:
import ast
ingredients_unique = set()

for ingredients_str in pov_df['ingredients_normalized'].dropna():
    ingredients_dict = ast.literal_eval(ingredients_str)
    ingredients_unique.update(ingredients_dict.keys())

In [None]:
pov_ingr_list = pd.DataFrame(ingredients_unique)
pov_ingr_list.columns = ["ingredient"]
pov_ingr_list

Unnamed: 0,ingredient
0,Икра овощная
1,Лимонник
2,Перец болгарский красный
3,Мясо
4,Пахта
...,...
974,Желудь
975,Кунжутные семечки
976,Сырок плавленый
977,Черемша


Всего 979 обработанных ингредиента в датасете Поваренка

Далее реализованы утилиты для матчинга: токенизация строк (буквенно-цифровые, в нижнем регистре), сбор служебного текста, Jaccard по токенам, поиск целого слова, нормализация запроса

In [None]:
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-яЁё0-9]+", re.UNICODE)

def tokenize_lower(text: str) -> List[str]:
    if not isinstance(text, str):
        return []
    return [t.lower() for t in _TOKEN_RE.findall(text)]

def to_index_text(*parts: str) -> str:
    return " ".join(p for p in parts if isinstance(p, str) and p.strip())

def jaccard(a: List[str], b: List[str]) -> float:
    if not a or not b:
        return 0.0
    sa, sb = set(a), set(b)
    inter = len(sa & sb)
    uni = len(sa | sb)
    return 0.0 if uni == 0 else inter / uni

def has_whole_word_ci(word: str, text: str) -> bool:
    if not word or not text:
        return False
    pattern = rf"(?<![A-Za-zА-Яа-яЁё0-9]){re.escape(word)}(?![A-Za-zА-Яа-яЁё0-9])"
    return re.search(pattern, text, flags=re.IGNORECASE) is not None

def _normalize_query_for_matching(text: str) -> str:
    tokens = tokenize_lower(text or "")
    tokens = [t for t in tokens if not t.startswith("цедр")]
    merged: List[str] = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and tokens[i] == "кус" and tokens[i+1] == "кус":
            merged.append("кускус")
            i += 2
            continue
        merged.append(tokens[i]); i += 1
    return " ".join(merged).strip()

def _rewrite_special_ingredients(text: str) -> Optional[str]:
    t = (text or "").lower()
    if "желтк" in t:
        return "яйцо куриное"
    return None

_GENERIC_RU: Set[str] = {"масло","смесь","икра","сироп","крем","тесто","пюре","соус","напиток", "цедра"} # Эти слова могут требовать дополнительных уточнений

### Класс-обертка **FaissTfidfIndex** для поиска по текстам

Строит TF-IDF -> SVD эмбеддинги, L2-нормирует и индексирует в Faiss


*   build — обучение и добавление в индекс
*   embed_query — преобразование запроса в вектор
*   score_query — индексы top-k кандидатов и их scores
*   score_doc — косинусная близость запроса к одному документу


In [None]:
@dataclass
class FaissTfidfIndex:
    vectorizer: TfidfVectorizer
    svd: TruncatedSVD
    index: "faiss.Index"
    X: np.ndarray

    @classmethod
    def build(cls, docs: Sequence[str], *, dims: int = 256, max_features: int = 200_000) -> "FaissTfidfIndex":
        texts = [d if isinstance(d, str) else "" for d in docs]
        vec = TfidfVectorizer(
            analyzer="word",
            token_pattern=r"(?u)[A-Za-zА-Яа-яЁё0-9]+",
            lowercase=True,
            min_df=1,
            max_features=max_features,
        )
        M = vec.fit_transform(texts)
        svd = TruncatedSVD(n_components=dims, random_state=42)
        X = svd.fit_transform(M).astype("float32")
        faiss.normalize_L2(X)
        index = faiss.IndexFlatIP(dims)
        index.add(X)
        return cls(vectorizer=vec, svd=svd, index=index, X=X)

    def _embed_query(self, query: str) -> np.ndarray:
        q = self.vectorizer.transform([query or ""])
        q = self.svd.transform(q).astype("float32")
        faiss.normalize_L2(q)
        return q

    def score_query(self, query: str, top_k: int = 200) -> Tuple[np.ndarray, np.ndarray]:
        q = self._embed_query(query)
        scores, idx = self.index.search(q, min(top_k, self.X.shape[0]))
        return scores[0], idx[0]

    def score_doc(self, query: str, doc_idx: int) -> float:
        q = self._embed_query(query)
        return float(np.dot(self.X[doc_idx], q[0]))

### **Индекс каталога ВкусВилл**

*   **VVCatalogItem** хранит метаданные продукта
*   **VVCatalogIndex.fit** подготавливает списки текстов, строит список для fuzzy и Faiss-индекс по названиям
*   **_candidate_ids** возвращает top-k кандидатов из Faiss
*   **_fuzzy_title_score / _faiss_title_score**: сигналы схожести
*   **match_one**:
    - Нормализует и токенизирует запрос (ингредиент), собирает кандидатов
    - Считает сигналы: Jaccard, cosine (c_pos), fuzzy, whole-word
    - Доменные правила («кускус», «яйца»), защитный гейт, взвешенный скоринг, штрафы
    - Возвращает топ‑результаты с `product_id`
*   **match_many** агрегирует результаты по списку ингредиентов в DataFrame, добавляет `match_id` (product_id)


In [18]:
@dataclass
class VVCatalogItem:
    doc_id: int
    product_id: object
    title: str
    category: str
    subcategory: str

class VVCatalogIndex:
    def __init__(
        self,
        penalize_ready_to_eat: bool = True,
    ) -> None:
        self.items: List[VVCatalogItem] = []
        self.title_texts: List[str] = []
        self.all_texts: List[str] = []
        self.penalize_ready_to_eat = penalize_ready_to_eat
        self._titles_fuzzy: List[str] = []
        self._faiss_title: Optional[FaissTfidfIndex] = None

    def fit(self, df: pd.DataFrame, text_fields: Sequence[str] = ("title","category","subcategory")) -> "VVCatalogIndex":
        df = df.copy()

        self.items = [
            VVCatalogItem(
                doc_id=int(i),
                product_id=r["id"],
                title=str(r["title"]),
                category=str(r["category"]),
                subcategory=str(r["subcategory"]),
            )
            for i, r in df.iterrows()
        ]

        self.title_texts = [str(r["title"]) for _, r in df.iterrows()]
        self.all_texts = [to_index_text(r["title"], r["category"], r["subcategory"]) for _, r in df.iterrows()]

        self._titles_fuzzy = [str(t).lower() for t in self.title_texts]
        self._faiss_title = FaissTfidfIndex.build(self.title_texts, dims=256)
        return self

    def _candidate_ids(self, query: str, k_title: int = 600, k_all: int = 150, k_faiss: int = 200) -> List[int]:
        cand_ids: List[int] = []
        if self._faiss_title is not None:
            _, idx3 = self._faiss_title.score_query(query, top_k=k_faiss)
            cand_ids.extend(idx3.tolist())
        seen, unique = set(), []
        for cid in cand_ids:
            if cid not in seen:
                unique.append(cid)
                seen.add(cid)
        return unique

    def _fuzzy_title_score(self, query: str, doc_idx: int) -> float:
        if not self._titles_fuzzy:
            return 0.0
        q = query.lower()
        return float(fuzz.token_set_ratio(q, self._titles_fuzzy[doc_idx]) / 100.0)

    def _faiss_title_score(self, query: str, doc_idx: int) -> float:
        if self._faiss_title is None:
            return 0.0
        return self._faiss_title.score_doc(query, doc_idx)

    def match_one(
        self,
        ingredient_name: str,
        *,
        top_k: int = 5,
        k_faiss: int = 200,
        min_title_signal: float = 0.02,
        min_fuzzy: float = 0.7,
        min_vec_sim: float = 0.18,
        w_fuzzy: float = 0.8,
        penalty_mismatch: float = 0.8,
    ) -> List[Dict]:
        query = ingredient_name

        raw_query = ingredient_name
        rewrite = _rewrite_special_ingredients(raw_query)
        query = _normalize_query_for_matching(rewrite or raw_query)

        q_tokens_all = tokenize_lower(query)
        if not q_tokens_all:
            return []
        q_main = q_tokens_all
        first_key = q_main[0]

        cand_ids = self._candidate_ids(query, k_faiss=k_faiss)
        if not cand_ids:
            return []

        results = []

        for cid in cand_ids:
            it = self.items[cid]
            title_tokens = tokenize_lower(it.title)

            title_l = it.title.lower()
            cat_l = it.category.lower()
            subcat_l = it.subcategory.lower()

            allow_by_word = any(has_whole_word_ci(t, it.title) for t in q_main)
            title_overlap = jaccard(q_main, title_tokens)
            vec_sim_raw = self._faiss_title_score(query, cid)
            c_pos = max(0.0, vec_sim_raw)
            fuzzy = self._fuzzy_title_score(query, cid)

            score = 0.0

            if "кускус" in q_main:
                if "кускус" not in title_tokens and fuzzy < 0.85:
                    continue
                if "вкус" in title_l:
                    score -= 1.5
                if cat_l.startswith("напит"):
                    score -= 3.0
                if ("круп" in cat_l) or ("круп" in subcat_l):
                    score += 0.8

            if ("яйца" in q_main) or ("яйцо" in q_main):
                if "курин" in title_l:
                    score += 0.6
                if "перепел" in title_l:
                    score -= 1.0
                if "копчен" in title_l:
                    score -= 2.5
                if "яйц" in subcat_l:
                    score += 0.6

            if not allow_by_word and title_overlap < min_title_signal and c_pos < min_vec_sim and fuzzy < min_fuzzy:
                continue

            score += 1.0 * title_overlap
            score += 1.8 * c_pos
            score += 1.5 * fuzzy

            if (first_key in _GENERIC_RU or q_main[-1] in _GENERIC_RU) and len(q_main) >= 2:
                if len(set(q_main[1:]) & set(title_tokens)) == 0:
                    score -= 1.2

            if first_key not in title_tokens and c_pos < 0.25:
                score -= penalty_mismatch

            reasons = {
                "title_overlap": round(float(title_overlap), 4),
                "vec_sim": round(float(c_pos), 4),
                "fuzzy": round(float(fuzzy), 4),
            }
            results.append((score, cid, reasons))

        # fallback
        if not results and cand_ids:
            best_allowed = None
            best_allowed_score = -1e9
            best_any = None
            best_any_score = -1e9

            for cid in cand_ids[:200]:
                it = self.items[cid]
                title_tokens = tokenize_lower(it.title)
                title_l = it.title.lower()
                cat_l = it.category.lower()
                subcat_l = it.subcategory.lower()

                c_raw = self._faiss_title_score(query, cid)
                c_pos = max(0.0, c_raw)
                f = self._fuzzy_title_score(query, cid)

                allow = (f >= 0.7) or any(has_whole_word_ci(t, it.title) for t in q_main)
                sc_allowed = 0.6 * c_pos + 0.4 * f
                if first_key not in title_tokens and c_pos < 0.25:
                    sc_allowed -= penalty_mismatch
                if "кускус" in q_main:
                    if "кускус" not in title_tokens and f < 0.85:
                        allow = False
                    if "вкус" in title_l:
                        sc_allowed -= 1.0
                    if ("круп" in cat_l) or ("круп" in subcat_l):
                        sc_allowed += 0.4

                if allow and sc_allowed > best_allowed_score:
                    best_allowed_score = sc_allowed
                    best_allowed = cid

                sc_any = 0.7 * c_pos + 0.3 * f

                if sc_any > best_any_score:
                    best_any_score = sc_any
                    best_any = cid

            if best_allowed is not None and best_allowed_score >= 0.65:
                it = self.items[best_allowed]
                return [{
                    "score": round(float(best_allowed_score), 4),
                    "doc_id": it.doc_id,
                    "product_id": it.product_id,
                    "title": it.title,
                    "category": it.category,
                    "subcategory": it.subcategory,
                    "reasons": {"fallback": True, "mode": "allowed"}
                }][:top_k]

            chosen = best_allowed if best_allowed is not None else best_any
            if chosen is not None:
                it = self.items[chosen]
                return [{
                    "score": round(float(best_any_score if chosen == best_any else best_allowed_score), 4),
                    "doc_id": it.doc_id,
                    "product_id": it.product_id,
                    "title": it.title,
                    "category": it.category,
                    "subcategory": it.subcategory,
                    "reasons": {
                        "fallback": True,
                        "mode": "forced",
                        "sc_allowed": round(float(best_allowed_score), 4) if best_allowed is not None else None,
                        "sc_any": round(float(best_any_score), 4) if best_any is not None else None,
                    }
                }][:top_k]

            return []


        if not results:
            return []

        results.sort(key=lambda x: x[0], reverse=True)
        top = results[:top_k]

        out: List[Dict] = []
        for score, cid, reasons in top:
            it = self.items[cid]
            out.append({
                "score": round(float(score), 4),
                "doc_id": it.doc_id,
                "product_id": it.product_id,
                "title": it.title,
                "category": it.category,
                "subcategory": it.subcategory,
                "reasons": reasons,
            })

        return out

    def match_many(self, ingredients: Iterable[str], top_k: int = 3, **kwargs) -> pd.DataFrame:
        rows = []
        for ing in ingredients:
            matches = self.match_one(ing, top_k=top_k, **kwargs)
            if not matches:
                rows.append({
                    "ingredient": ing,
                    "match_title": None,
                    "match_category": None,
                    "match_subcategory": None,
                    "match_id": None,
                    "score": None,
                    "reasons": {},
                })
                continue
            for m in matches:
                rows.append({
                    "ingredient": ing,
                    "match_title": m["title"],
                    "match_category": m["category"],
                    "match_subcategory": m["subcategory"],
                    "match_id": m["product_id"],
                    "score": m["score"],
                    "reasons": m["reasons"],
                })
        return pd.DataFrame(rows)

### **Матчинг ингредиентов из Поваренка во ВкусВилл**

In [29]:
vv_path = Path("products_final_categories2_short_names.csv")
vv_df_raw = df = pd.read_csv(vv_path, sep=None, engine="python", encoding="utf-8", on_bad_lines="skip").copy()

vv_df = pd.DataFrame({
    "id": vv_df_raw["id"],
    "title": vv_df_raw["name"],
    "category": vv_df_raw["metadata"].apply(lambda x: json.loads(x)['category']),
    "subcategory": vv_df_raw["metadata"].apply(lambda x: json.loads(x)['subcategory'])
})

vv_df["title"] = vv_df["title"].str.replace(r"\s+", " ", regex=True).str.strip()

print(f"Загружено продуктов ВкусВилла: {len(vv_df)}")

Загружено продуктов ВкусВилла: 8709


In [None]:
idx = VVCatalogIndex(penalize_ready_to_eat=True).fit(vv_df[["id", "title", "category", "subcategory"]])
pov_ings = pov_ingr_list["ingredient"].tolist()

df_matches = idx.match_many(
    pov_ings,
    top_k=3,
    min_title_signal=0.02,
    penalty_mismatch=0.6,
)

In [24]:
print(df_matches.head(30).to_string(index=False))

              ingredient                                   match_title                     match_category                         match_subcategory  match_id  score                                                                    reasons
            Икра овощная Закуска овощная "Аджапсандал" низкокалорийная                        Консервация                  Хумус, закуски и соленья   10274.0 2.4109                 {'title_overlap': 0.2, 'vec_sim': 0.6142, 'fuzzy': 0.7368}
            Икра овощная  Смесь овощная "Паприкаш", быстрозамороженная              Замороженные продукты Замороженные овощи, грибы, ягоды и фрукты    7977.0 2.2635                 {'title_overlap': 0.2, 'vec_sim': 0.5323, 'fuzzy': 0.7368}
            Икра овощная          Смесь овощная "Карибская", быстрозам              Замороженные продукты Замороженные овощи, грибы, ягоды и фрукты    7968.0 2.2409                 {'title_overlap': 0.2, 'vec_sim': 0.5198, 'fuzzy': 0.7368}
                Лимонник                

In [25]:
df_matches.to_csv('pov_to_vv.csv', index=False)

____

### **Матчинг продуктов из ВкусВилла в Поваренок**

Сделаем обратный матчинг из продуктов ВкусВилла в ингридиенты Поваренка (структура аналогична матчингу из Поваренка во ВкусВилл). Строим Faiss индекс ингредиентов, для каждого product_title находим релевантные ингриденты на основе посчитанного покрытия ингредиента в названии (coverage), косинусной близости в эмбеддингах (c_pos), Jaccard по токенам и fuzzy (token_set_ratio). Кандидаты ранжируются по взвешенному score, в итоге будем использовать top-1 для каждого продукта из ВкусВилла

In [None]:
@dataclass
class PovIngredientItem:
    doc_id: int
    ingredient: str

class PovIngredientsIndex:
    def __init__(self) -> None:
        self.items: List[PovIngredientItem] = []
        self.ingredient_texts: List[str] = []
        self._ingredients_fuzzy: List[str] = []
        self._faiss_ing: Optional[FaissTfidfIndex] = None

    def fit(self, df: pd.DataFrame) -> "PovIngredientsIndex":
        df = df.copy()
        self.items = [
            PovIngredientItem(
                doc_id=int(i),
                ingredient=str(r["ingredient"]),
            )
            for i, r in df.iterrows()
        ]
        self.ingredient_texts = [it.ingredient for it in self.items]
        self._ingredients_fuzzy = [t.lower() for t in self.ingredient_texts]
        self._faiss_ing = FaissTfidfIndex.build(self.ingredient_texts, dims=256)
        return self

    def _candidate_ids(self, query: str, k_faiss: int = 250) -> List[int]:
        cand_ids: List[int] = []
        if self._faiss_ing is not None:
            _, idx = self._faiss_ing.score_query(query, top_k=k_faiss)
            cand_ids.extend(idx.tolist())
        seen, unique = set(), []
        for cid in cand_ids:
            if cid not in seen:
                unique.append(cid)
                seen.add(cid)
        return unique

    def _fuzzy_ing_score(self, query: str, doc_idx: int) -> float:
        if not self._ingredients_fuzzy:
            return 0.0
        q = query.lower()
        return float(fuzz.token_set_ratio(q, self._ingredients_fuzzy[doc_idx]) / 100.0)

    def _faiss_ing_score(self, query: str, doc_idx: int) -> float:
        if self._faiss_ing is None:
            return 0.0
        return self._faiss_ing.score_doc(query, doc_idx)

    def match_one(
        self,
        product_title: str,
        *,
        top_k: int = 5,
        k_faiss: int = 250,
        min_coverage_ing: float = 0.5,
        min_fuzzy: float = 0.75,
        min_vec_sim: float = 0.15,
        penalty_mismatch: float = 0.6,
    ) -> List[Dict]:
        query = product_title
        q_tokens = tokenize_lower(query)
        cand_ids = self._candidate_ids(query, k_faiss=k_faiss)
        if not cand_ids:
            return []

        q_tokens_set = set(q_tokens)
        results = []

        for cid in cand_ids:
            it = self.items[cid]
            ing_tokens = tokenize_lower(it.ingredient)
            if not ing_tokens:
                continue
            ing_tokens_set = set(ing_tokens)

            allow_by_word = any(has_whole_word_ci(t, query) for t in ing_tokens)
            title_overlap = jaccard(q_tokens, ing_tokens)
            vec_sim_raw = self._faiss_ing_score(query, cid)
            c_pos = max(0.0, vec_sim_raw)
            fuzzy = self._fuzzy_ing_score(query, cid)

            inter = len(q_tokens_set & ing_tokens_set)
            coverage_ing = inter / max(1, len(ing_tokens_set))

            if not allow_by_word and coverage_ing < min_coverage_ing and c_pos < min_vec_sim and fuzzy < min_fuzzy:
                continue

            score = 0.0
            score += 1.2 * coverage_ing
            score += 0.6 * title_overlap
            score += 1.6 * c_pos
            score += 1.5 * fuzzy

            if len(ing_tokens) == 1 and ing_tokens[0] in _GENERIC_RU:
                if not allow_by_word and fuzzy < 0.8:
                    score -= 1.0

            if not allow_by_word and c_pos < 0.25:
                score -= penalty_mismatch

            reasons = {
                "coverage_ing": round(float(coverage_ing), 4),
                "title_overlap": round(float(title_overlap), 4),
                "vec_sim": round(float(c_pos), 4),
                "fuzzy": round(float(fuzzy), 4),
            }
            results.append((score, cid, reasons))

        # fallback
        if not results and cand_ids:
            best_allowed = None
            best_allowed_score = -1e9
            best_any = None
            best_any_score = -1e9

            for cid in cand_ids[:200]:
                it = self.items[cid]
                ing_tokens = tokenize_lower(it.ingredient)
                if not ing_tokens:
                    continue
                ing_tokens_set = set(ing_tokens)

                allow_by_word = any(has_whole_word_ci(t, query) for t in ing_tokens)
                c_raw = self._faiss_ing_score(query, cid)
                c_pos = max(0.0, c_raw)
                f = self._fuzzy_ing_score(query, cid)

                inter = len(q_tokens_set & ing_tokens_set)
                coverage_ing = inter / max(1, len(ing_tokens_set))

                allow = allow_by_word or (f >= 0.8) or (coverage_ing >= 0.5)

                sc_allowed = 0.6 * c_pos + 0.4 * f
                if not allow_by_word and c_pos < 0.25:
                    sc_allowed -= penalty_mismatch
                if len(ing_tokens) == 1 and ing_tokens[0] in _GENERIC_RU and not allow_by_word:
                    sc_allowed -= 0.8

                if allow and sc_allowed > best_allowed_score:
                    best_allowed_score = sc_allowed
                    best_allowed = cid

                sc_any = 0.7 * c_pos + 0.3 * f
                if len(ing_tokens) == 1 and ing_tokens[0] in _GENERIC_RU:
                    sc_any -= 0.3
                if sc_any > best_any_score:
                    best_any_score = sc_any
                    best_any = cid

            if best_allowed is not None and best_allowed_score >= 0.65:
                it = self.items[best_allowed]
                return [{
                    "score": round(float(best_allowed_score), 4),
                    "doc_id": it.doc_id,
                    "ingredient": it.ingredient,
                    "reasons": {"fallback": True, "mode": "allowed"},
                }][:top_k]

            chosen = best_allowed if best_allowed is not None else best_any
            if chosen is not None:
                it = self.items[chosen]
                return [{
                    "score": round(float(best_any_score if chosen == best_any else best_allowed_score), 4),
                    "doc_id": it.doc_id,
                    "ingredient": it.ingredient,
                    "reasons": {
                        "fallback": True,
                        "mode": "forced"
                    },
                }][:top_k]

            return []

        if not results:
            return []

        results.sort(key=lambda x: x[0], reverse=True)
        top = results[:top_k]

        out: List[Dict] = []
        for score, cid, reasons in top:
            it = self.items[cid]
            out.append({
                "score": round(float(score), 4),
                "doc_id": it.doc_id,
                "ingredient": it.ingredient,
                "reasons": reasons,
            })
        return out

    def match_many(self, product_titles: Iterable[str], top_k: int = 3, product_ids: Optional[Iterable] = None, **kwargs) -> pd.DataFrame:
        titles = list(product_titles)
        ids = list(product_ids) if product_ids is not None else [None] * len(titles)
        if len(ids) != len(titles):
            if len(ids) < len(titles):
                ids = ids + [None] * (len(titles) - len(ids))
            else:
                ids = ids[:len(titles)]

        rows = []
        for pid, title in zip(ids, titles):
            matches = self.match_one(title, top_k=top_k, **kwargs)
            if not matches:
                rows.append({
                    "product_id": pid,
                    "product_title": title,
                    "match_ingredient": None,
                    "score": None,
                    "reasons": {},
                })
                continue
            for m in matches:
                rows.append({
                    "product_id": pid,
                    "product_title": title,
                    "match_ingredient": m["ingredient"],
                    "score": m["score"],
                    "reasons": m["reasons"],
                })
        return pd.DataFrame(rows)

In [None]:
vv_df = pd.DataFrame({
    "id": vv_df_raw["id"],
    "title": vv_df_raw["short_name"],
    "category": vv_df_raw["metadata"].apply(lambda x: json.loads(x)['category']),
    "subcategory": vv_df_raw["metadata"].apply(lambda x: json.loads(x)['subcategory']),
})

In [None]:
pov_idx = PovIngredientsIndex().fit(pov_ingr_list[["ingredient"]])
vv_titles = vv_df["title"].tolist()
vv_titles[:5]

['Абрикос мытый',
 'Абрикос планета витаминов замороженный',
 'Абрикосы сушеные',
 'Аджика',
 'Аджика']

In [None]:
df_rev = pov_idx.match_many(vv_df["title"].tolist(), product_ids=vv_df["id"].tolist(), top_k=3)

In [27]:
print(df_rev[:30].to_string(index=False))

 product_id                          product_title     match_ingredient  score                                                                            reasons
          1                          Абрикос мытый              Абрикос 4.6000          {'coverage_ing': 1.0, 'title_overlap': 0.5, 'vec_sim': 1.0, 'fuzzy': 1.0}
          1                          Абрикос мытый               Бекмес 0.2596    {'coverage_ing': 0.0, 'title_overlap': 0.0, 'vec_sim': 0.2412, 'fuzzy': 0.3158}
          1                          Абрикос мытый      Творожная масса 0.2273    {'coverage_ing': 0.0, 'title_overlap': 0.0, 'vec_sim': 0.2492, 'fuzzy': 0.2857}
          2 Абрикос планета витаминов замороженный              Абрикос 4.4500         {'coverage_ing': 1.0, 'title_overlap': 0.25, 'vec_sim': 1.0, 'fuzzy': 1.0}
          2 Абрикос планета витаминов замороженный      Творожная масса 0.4214    {'coverage_ing': 0.0, 'title_overlap': 0.0, 'vec_sim': 0.2492, 'fuzzy': 0.4151}
          2 Абрикос планета 

In [17]:
df_rev.to_csv('vv_to_pov.csv', index=False)