In [None]:
# KNN и FAISS
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import faiss
from collections import defaultdict
import random
import torch

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
seed_everything(42)

In [None]:
train_path = "data/df_train_CLIP_rating_based.csv"
test_path = "data/df_test_ground_truth_rating_based.csv"

In [5]:
df_test = pd.read_csv(test_path)
df_train = pd.read_csv(
    train_path,
    na_values=[""],  # исключаем "Unknown"
    keep_default_na=False
)

In [6]:
# Построение ground_truth
ground_truth = (
    df_test[df_test["label"] == 1]
    .groupby("user_id")["item_id"]
    .apply(set)
    .to_dict()
)

In [7]:
# Выбираем CLIP-эмбеддинги (текст + изображение)
item_vector_cols = [col for col in df_train.columns if col.startswith("clip_text_") or col.startswith("clip_img_")]
clip_vectors = df_train.drop_duplicates("item_id")[["item_id"] + item_vector_cols].set_index("item_id")
item_vectors = clip_vectors.copy()

In [8]:
# Строим user-векторы только для пользователей с положительными интеракциями в трейне
user_vectors = {}
for user_id, group in tqdm(df_train[df_train["label"] == 1].groupby("user_id"), desc="User vector aggregation"):
    item_ids = group["item_id"].values
    vectors = item_vectors.loc[item_ids].values
    user_vectors[user_id] = np.mean(vectors, axis=0)

User vector aggregation: 100%|██████████| 551853/551853 [02:24<00:00, 3819.60it/s]


In [9]:
valid_users_set = set(user_vectors.keys())

In [None]:
# Precision@K и MAP@K
def precision_at_k(preds, ground_truth, k=10):
    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        gt_items = ground_truth[user]
        hits = sum([1 for item in pred_items[:k] if item in gt_items])
        scores.append(hits / k)
    return round(np.mean(scores), 4)

def apk(pred, actual, k=10):
    if not actual:
        return 0.0
    pred = pred[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(pred):
        if p in actual and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def map_at_k(preds, ground_truth, k=10):
    return round(
        np.mean([apk(preds[u], ground_truth[u], k) for u in preds if u in ground_truth]), 4
    )

## Model Functions

In [None]:
# KNN по CLIP-эмбеддингам
def run_clip_knn(user_vectors, item_vectors, top_k):
    model_knn = NearestNeighbors(n_neighbors=top_k, metric="cosine", n_jobs=-1)
    model_knn.fit(item_vectors.values)
    item_ids = item_vectors.index.tolist()

    preds = {}
    for user_id, user_vec in user_vectors.items():
        _, indices = model_knn.kneighbors([user_vec])
        preds[user_id] = [item_ids[i] for i in indices[0]]

    return preds

In [None]:
# FAISS по CLIP-эмбеддингам
def run_clip_faiss(user_vectors, item_vectors, top_k=10):
    item_matrix = np.ascontiguousarray(item_vectors.values.astype("float32"))
    faiss.normalize_L2(item_matrix)

    index = faiss.IndexFlatIP(item_matrix.shape[1])
    index.add(item_matrix)
    item_ids = item_vectors.index.tolist()

    preds = {}
    for user_id, user_vec in user_vectors.items():
        user_vec = np.ascontiguousarray(user_vec.astype("float32")).reshape(1, -1)
        faiss.normalize_L2(user_vec)

        _, indices = index.search(user_vec, top_k)
        preds[user_id] = [item_ids[i] for i in indices[0]]

    return preds

In [None]:
def evaluate_model(preds, ground_truth, valid_users=None):
    if valid_users is not None:
        preds = {u: preds[u] for u in preds if u in valid_users}
    return {
        "precision": precision_at_k(preds, ground_truth),
        "map": map_at_k(preds, ground_truth)
    }

In [14]:
results = []

In [15]:
for k in tqdm([5, 10, 20], desc="CLIP-KNN"):
    print(f"Started iter for k = {k}")
    preds = run_clip_knn(user_vectors, item_vectors, k)
    print("Preds done")
    score = evaluate_model(preds, ground_truth, valid_users=valid_users_set)
    print("Score calculated")
    results.append(("CLIP_KNN", k, score))
    print(f"Finished iter for k = {k}")

CLIP-KNN:   0%|          | 0/3 [00:00<?, ?it/s]

Started iter for k = 5
Preds done


CLIP-KNN:  33%|███▎      | 1/3 [6:44:36<13:29:12, 24276.39s/it]

Score calculated
Finished iter for k = 5
Started iter for k = 10
Preds done


CLIP-KNN:  67%|██████▋   | 2/3 [13:22:11<6:40:28, 24028.79s/it]

Score calculated
Finished iter for k = 10
Started iter for k = 20
Preds done


CLIP-KNN: 100%|██████████| 3/3 [19:58:39<00:00, 23973.14s/it]  

Score calculated
Finished iter for k = 20





In [None]:
for k in tqdm([3, 5, 8, 10, 12, 15, 20, 30, 50, 100], desc="CLIP-FAISS"):
    print(f"Started iter for k = {k}")
    preds = run_clip_faiss(user_vectors, item_vectors, k)
    print("Preds done")
    score = evaluate_model(preds, ground_truth, valid_users=valid_users_set)
    print("Score calculated")
    results.append(("CLIP_FAISS", k, score))
    print(f"Finished iter for k = {k}")

CLIP-FAISS:   0%|          | 0/10 [00:00<?, ?it/s]

Started iter for k = 3
Preds done


CLIP-FAISS:  10%|█         | 1/10 [28:39<4:17:51, 1719.02s/it]

Score calculated
Finished iter for k = 3
Started iter for k = 5
Preds done


CLIP-FAISS:  20%|██        | 2/10 [56:44<3:46:36, 1699.56s/it]

Score calculated
Finished iter for k = 5
Started iter for k = 8
Preds done


CLIP-FAISS:  30%|███       | 3/10 [1:24:52<3:17:37, 1693.95s/it]

Score calculated
Finished iter for k = 8
Started iter for k = 10
Preds done


CLIP-FAISS:  40%|████      | 4/10 [1:53:01<2:49:11, 1691.91s/it]

Score calculated
Finished iter for k = 10
Started iter for k = 12
Preds done


CLIP-FAISS:  50%|█████     | 5/10 [2:20:58<2:20:32, 1686.53s/it]

Score calculated
Finished iter for k = 12
Started iter for k = 15
Preds done


CLIP-FAISS:  60%|██████    | 6/10 [2:48:45<1:52:00, 1680.20s/it]

Score calculated
Finished iter for k = 15
Started iter for k = 20
Preds done


CLIP-FAISS:  70%|███████   | 7/10 [3:16:10<1:23:25, 1668.64s/it]

Score calculated
Finished iter for k = 20
Started iter for k = 30
Preds done


CLIP-FAISS:  80%|████████  | 8/10 [3:43:31<55:19, 1659.91s/it]  

Score calculated
Finished iter for k = 30
Started iter for k = 50
Preds done


CLIP-FAISS:  90%|█████████ | 9/10 [4:11:01<27:36, 1656.52s/it]

Score calculated
Finished iter for k = 50
Started iter for k = 100
Preds done


CLIP-FAISS: 100%|██████████| 10/10 [4:39:07<00:00, 1674.70s/it]

Score calculated
Finished iter for k = 100





In [None]:
model_scores = defaultdict(list)
for name, param, score in results:
    model_scores[name].append((param, score))

In [18]:
for model, entries in model_scores.items():
    best = max(entries, key=lambda x: x[1]["precision"])
    print(f"{model}: best_param={best[0]}, precision={best[1]['precision']}, map={best[1]['map']}")

CLIP_KNN: best_param=10, precision=0.0032, map=0.008
CLIP_FAISS: best_param=10, precision=0.0032, map=0.008
