In [1]:
# Explicit Matching Models for Laptop Evaluation
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import faiss
from collections import defaultdict

In [2]:
# Load full dataset
train_path = "data/df_train_CLIP_rating_based.csv"
test_path = "data/df_test_ground_truth_rating_based.csv"

In [3]:
df_test = pd.read_csv(test_path)
df_train = pd.read_csv(
    train_path,
    na_values=[""],  # исключаем "Unknown"
    keep_default_na=False
)

In [4]:
# Построение ground_truth
ground_truth = (
    df_test[df_test["label"] == 1]
    .groupby("user_id")["item_id"]
    .apply(set)
    .to_dict()
)

In [5]:
# Precision@K and MAP@K
def precision_at_k(preds, ground_truth, k=10):
    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        gt_items = ground_truth[user]
        hits = sum([1 for item in pred_items[:k] if item in gt_items])
        scores.append(hits / k)
    return round(np.mean(scores), 4)

def apk(pred, actual, k=10):
    if not actual:
        return 0.0
    pred = pred[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(pred):
        if p in actual and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def map_at_k(preds, ground_truth, k=10):
    return round(
        np.mean([apk(preds[u], ground_truth[u], k) for u in preds if u in ground_truth]), 4
    )

## Model Functions

In [6]:
# KNN over CLIP Embeddings 
def run_clip_knn(df_train, df_test, k):
    emb_cols = [c for c in df_train.columns if c.startswith("clip_text_") or c.startswith("clip_img_")]
    df_items = df_train.drop_duplicates("item_id")[["item_id"] + emb_cols].dropna().set_index("item_id")
    model_knn = NearestNeighbors(n_neighbors=k+1, metric="cosine")
    model_knn.fit(df_items.values)
    preds = {}
    for uid, group in df_test.groupby("user_id"):
        items = [i for i in group["item_id"].unique() if i in df_items.index]
        if not items:
            continue
        sim = []
        for i in items:
            idx = df_items.index.get_loc(i)
            _, inds = model_knn.kneighbors([df_items.iloc[idx].values])
            sim.extend(df_items.iloc[inds[0][1:]].index.tolist())
        preds[uid] = sim[:10]
    return preds

In [7]:
# FAISS  over CLIP Embeddings
def run_clip_faiss(df_train, df_test, top_k=10):
    emb_cols = [c for c in df_train.columns if c.startswith("clip_text_") or c.startswith("clip_img_")]
    df_items = df_train.drop_duplicates("item_id")[["item_id"] + emb_cols].dropna().set_index("item_id")
    
    # Индексация через FAISS
    index = faiss.IndexFlatIP(len(emb_cols))  # dot-product
    item_vectors = np.ascontiguousarray(df_items.values.astype("float32"))
    faiss.normalize_L2(item_vectors)
    index.add(item_vectors)
    item_ids = df_items.index.tolist()

    preds = {}
    for uid, group in df_test.groupby("user_id"):
        user_items = [i for i in group["item_id"].unique() if i in df_items.index]
        if not user_items:
            continue

        user_vec = np.ascontiguousarray(df_items.loc[user_items]
                                        .values.astype("float32"))
        faiss.normalize_L2(user_vec)
        user_mean = np.mean(user_vec, axis=0, keepdims=True)

        _, indices = index.search(user_mean, top_k)
        preds[uid] = [item_ids[i] for i in indices[0]]
    return preds

In [8]:
# Run grid search
def evaluate_model(preds, ground_truth):
    return {
        "precision": precision_at_k(preds, ground_truth),
        "map": map_at_k(preds, ground_truth)
    }

In [9]:
results = []

In [10]:
for k in tqdm([5, 10, 20], desc="CLIP-KNN"):
    print(f"Started iter for k = {k}")
    preds = run_clip_knn(df_train, df_test, k)
    print("Preds done")
    score = evaluate_model(preds, ground_truth)
    print("Score calculated")
    results.append(("CLIP_KNN", k, score))
    print(f"Finished iter for k = {k}")

CLIP-KNN:   0%|          | 0/3 [00:00<?, ?it/s]

Started iter for k = 5
Preds done


CLIP-KNN:  33%|███▎      | 1/3 [2:34:12<5:08:25, 9252.63s/it]

Score calculated
Finished iter for k = 5
Started iter for k = 10


CLIP-KNN:  67%|██████▋   | 2/3 [5:07:03<2:33:24, 9204.71s/it]

Preds done
Score calculated
Finished iter for k = 10
Started iter for k = 20
Preds done


CLIP-KNN: 100%|██████████| 3/3 [7:38:45<00:00, 9175.18s/it]  

Score calculated
Finished iter for k = 20





In [11]:
# --- FAISS-based CLIP matching ---
for k in tqdm([5, 10, 20], desc="CLIP-FAISS"):
    print(f"Started iter for k = {k}")
    preds = run_clip_faiss(df_train, df_test, top_k=k)
    print("Preds done")
    score = evaluate_model(preds, ground_truth)
    print("Score calculated")
    results.append(("CLIP_FAISS", k, score))
    print(f"Finished iter for k = {k}")

CLIP-FAISS:   0%|          | 0/3 [00:00<?, ?it/s]

Started iter for k = 5
Preds done


CLIP-FAISS:  33%|███▎      | 1/3 [04:22<08:44, 262.24s/it]

Score calculated
Finished iter for k = 5
Started iter for k = 10
Preds done


CLIP-FAISS:  67%|██████▋   | 2/3 [08:41<04:20, 260.40s/it]

Score calculated
Finished iter for k = 10
Started iter for k = 20
Preds done


CLIP-FAISS: 100%|██████████| 3/3 [13:01<00:00, 260.58s/it]

Score calculated
Finished iter for k = 20





In [12]:
# Print best results
print("\nBest results by model:")
model_scores = defaultdict(list)
for name, param, score in results:
    model_scores[name].append((param, score))


Best results by model:


In [13]:
for model, entries in model_scores.items():
    best = max(entries, key=lambda x: x[1]["precision"])
    print(f"{model}: best_param={best[0]}, precision={best[1]['precision']}, map={best[1]['map']}")

CLIP_KNN: best_param=5, precision=0.0085, map=0.015
CLIP_FAISS: best_param=10, precision=0.1358, map=0.9176
