In [1]:
# Импорт библиотек
import pandas as pd
import numpy as np
import datetime
import random

In [2]:
# Загружаем основной датасет взаимодействий пользователей и товаров
df = pd.read_csv("data/amazon_interactions_filtered.csv")

In [3]:
# Оставим только взаимодействия, попадающие в период 2015–2018 годов
df = df[df["timestamp"] >= pd.Timestamp("2015-01-01").timestamp()].copy()

In [4]:
# Сортируем по timestamp, чтобы сохранить временную консистентность
df = df.sort_values("timestamp")

In [5]:
# Используем временной сплит: 2015–2017 как train, 2018 как test
df_train = df[df["timestamp"] < pd.Timestamp("2018-01-01").timestamp()]
df_test = df[(df["timestamp"] >= pd.Timestamp("2018-01-01").timestamp()) & 
             (df["timestamp"] < pd.Timestamp("2018-10-01").timestamp())]

In [6]:
# Подготовим два варианта: rating-based и interaction-based
df_train_rating_based = df_train.copy()
df_train_interaction_based = df_train.copy()

In [7]:
# Метка = 1, если рейтинг >= 4; иначе 0
df_train_rating_based["label"] = (df_train_rating_based["rating"] >= 4).astype(int)

In [8]:
# Все взаимодействия считаем положительными (label = 1)
df_train_interaction_based["label"] = 1

In [9]:
# Чтобы избежать утечки информации
df_train_rating_based = df_train_rating_based[["user_id", "item_id", "label"]].copy()
df_train_interaction_based = df_train_interaction_based[["user_id", "item_id", "label"]].copy()

In [10]:
# Проверим размер и баланс классов
print("Rating-based labels:\n", df_train_rating_based["label"].value_counts())
print("\nInteraction-based labels:\n", df_train_interaction_based["label"].value_counts())

Rating-based labels:
 label
1    1573593
0     299578
Name: count, dtype: int64

Interaction-based labels:
 label
1    1873171
Name: count, dtype: int64


In [11]:
# Считаем только взаимодействия с label = 1 (оценка >= 4)
df_test_rating = df_test[df_test["rating"] >= 4].copy()
df_test_rating["label"] = 1
ground_truth_rating = df_test_rating.groupby("user_id")["item_id"].apply(set).to_dict()

In [12]:
# Считаем любое взаимодействие положительным
df_test_interaction = df_test.copy()
df_test_interaction["label"] = 1
ground_truth_interaction = df_test_interaction.groupby("user_id")["item_id"].apply(set).to_dict()

In [13]:
# Быстрая проверка количества пользователей с ground_truth
print(f"Ground truth (rating-based): {len(ground_truth_rating)} пользователей")
print(f"Ground truth (interaction-based): {len(ground_truth_interaction)} пользователей")

Ground truth (rating-based): 140627 пользователей
Ground truth (interaction-based): 165753 пользователей


In [14]:
# Фиксируем список пользователей, которые пересекаются с трейном (уже был отбор выше)
# Этот список пригодится позже для фильтрации предсказаний
users_in_train = set(df_train["user_id"])

In [15]:
# Формируем множество пользователей, у которых есть user_vector (на основе положительных взаимодействий)
users_with_user_vector = set(df_train_rating_based[df_train_rating_based["label"] == 1]["user_id"])

In [16]:
# Отфильтровываем ground_truth для рейтингов — только юзеры с положительными примерами в трейне
ground_truth_rating = {u: items for u, items in ground_truth_rating.items() if u in users_with_user_vector}

In [17]:
# Для interaction-based можно оставить фильтрацию по users_in_train, если хочешь, но формально она уже не нужна — все interaction-пользователи участвуют в трейне с label=1
ground_truth_interaction = {u: items for u, items in ground_truth_interaction.items() if u in users_in_train}

In [18]:
# Загрузка мета-данных товаров
df_meta = pd.read_csv(
    "data/amazon_meta_clean.csv",  # путь к актуальному файлу
    na_values=[""],
    keep_default_na=False
)

In [19]:
# Переименование ключа для мержа
df_meta.rename(columns={"asin": "item_id"}, inplace=True)

In [20]:
# Удалим tf-idf признаки — они не нужны для CLIP
tfidf_cols = [col for col in df_meta.columns if col.startswith("tfidf_")]
df_meta = df_meta.drop(columns=tfidf_cols)

In [21]:
# Удалим строки с отсутствующими критичными полями
required_fields = ["text_full", "image_main", "brand"]
df_meta_clean = df_meta.dropna(subset=required_fields).copy()

In [22]:
# Удалим текст и изображения (они уже представлены в эмбеддингах)
df_meta_clean.drop(columns=["text_full", "image_main"], inplace=True)

In [23]:
# Проверим размер после очистки
print(f"Размер df_meta_clean после фильтрации: {df_meta_clean.shape}")

Размер df_meta_clean после фильтрации: (148948, 210)


In [24]:
# Мержим с train по рейтингам
df_train_rating_merged = df_train_rating_based.merge(
    df_meta_clean,
    on="item_id",
    how="inner"  # жёсткий мерж: нужны только товары с признаками
)

In [25]:
# Аналогично — для interaction-based
df_train_interaction_merged = df_train_interaction_based.merge(
    df_meta_clean,
    on="item_id",
    how="inner"
)

In [26]:
# Проверим, сколько осталось строк после мержа
print("Rating-based merged:", len(df_train_rating_merged))
print("Interaction-based merged:", len(df_train_interaction_merged))

Rating-based merged: 1635089
Interaction-based merged: 1635089


In [27]:
# Оставим только пользователей, у которых есть положительные примеры
# Это пригодится позже при агрегации user-векторов
positive_users_rating = set(df_train_rating_merged[df_train_rating_merged["label"] == 1]["user_id"])
positive_users_interaction = set(df_train_interaction_merged[df_train_interaction_merged["label"] == 1]["user_id"])

In [28]:
# Выделим только CLIP-эмбеддинги из df_meta_clean
clip_vector_cols = [col for col in df_meta_clean.columns if col.startswith("clip_text_") or col.startswith("clip_img_")]

In [29]:
# Подготовим item-векторы для инференса: item_id + эмбеддинги
item_vectors_full = df_meta_clean[["item_id"] + clip_vector_cols].drop_duplicates("item_id").set_index("item_id")

In [30]:
# Список item_id кандидатов
candidate_items = item_vectors_full.index.tolist()

In [31]:
# Матрица эмбеддингов товаров (будет подаваться в модель при инференсе)
candidate_vectors = item_vectors_full.values

## Создание hard negatives

In [32]:
# Оставим только нужные поля: item_id + clip-эмбеддинги
df_meta_sample = df_meta_clean[["item_id"] + clip_vector_cols].drop_duplicates("item_id").copy()
df_meta_sample = df_meta_sample.set_index("item_id")

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import sys

In [34]:
# Число негативных примеров на одного пользователя
n_neg = 3

### rating-based

In [35]:
import faiss

In [37]:
# Словарь: user_id → список item_id, которые пользователь оценил положительно
user_positive_items = (
    df_train_rating_merged[df_train_rating_merged["label"] == 1]
    .groupby("user_id")["item_id"]
    .apply(set)
    .to_dict()
)

In [38]:
# Преобразуем item-векторы в матрицу float32
item_matrix = np.ascontiguousarray(df_meta_sample.values.astype("float32"))
item_ids_all = df_meta_sample.index.tolist()

In [39]:
# Нормализуем item-векторы для косинусного сходства (L2-нормировка)
faiss.normalize_L2(item_matrix)

In [40]:
# Создаем FAISS index
index = faiss.IndexFlatIP(item_matrix.shape[1])
index.add(item_matrix)

In [41]:
# Хранилище для хард-негативов
hard_negatives = []

In [42]:
for user_id in tqdm(user_positive_items.keys(), desc="Generating hard negatives (FAISS)", file=sys.stdout):
    pos_items = user_positive_items[user_id]

    if len(pos_items) < 1:
        continue

    try:
        pos_vectors = df_meta_sample.loc[list(pos_items)].values.astype("float32")
    except KeyError:
        continue

    # Средний вектор интересов пользователя
    user_vec = np.mean(pos_vectors, axis=0).reshape(1, -1).astype("float32")
    faiss.normalize_L2(user_vec)

    # Поиск ближайших top-100 кандидатов
    D, I = index.search(user_vec, 100)

    # Отбираем top-N, исключая позитивные
    count = 0
    for idx in I[0]:
        candidate_item = item_ids_all[idx]
        if candidate_item not in pos_items:
            hard_negatives.append((user_id, candidate_item, 0))
            count += 1
        if count >= n_neg:
            break

Generating hard negatives (FAISS): 100%|██████████| 551853/551853 [1:08:30<00:00, 134.25it/s]


In [43]:
# Создание датафрейма с хард-негативами
df_hard_negatives_rating = pd.DataFrame(hard_negatives, columns=["user_id", "item_id", "label"])

In [44]:
# Проверим размер и немного примеров
print(f"Hard negatives (rating-based): {len(df_hard_negatives_rating):,} строк")
df_hard_negatives_rating.head()

Hard negatives (rating-based): 1,655,559 строк


Unnamed: 0,user_id,item_id,label
0,A000013090ZI3HIT9N5V,B00OOJM0LA,0
1,A000013090ZI3HIT9N5V,B01G68JPQ4,0
2,A000013090ZI3HIT9N5V,B0059AP67S,0
3,A00408825PVJW7GFLEGU,B00COBZGMU,0
4,A00408825PVJW7GFLEGU,B012XC8FYU,0


In [45]:
# Отбираем только положительные примеры
df_positive_rating = df_train_rating_merged[df_train_rating_merged["label"] == 1][["user_id", "item_id", "label"]].copy()

In [46]:
# Объединяем
df_train_rating_hard = pd.concat([df_positive_rating, df_hard_negatives_rating], ignore_index=True)

In [47]:
# Проверим финальный баланс классов
print("Финальный датасет (rating-based):")
print(df_train_rating_hard["label"].value_counts())

Финальный датасет (rating-based):
label
0    1655559
1    1374168
Name: count, dtype: int64


In [48]:
df_train_rating_hard.to_csv("data/df_train_rating_hard.csv", index=False)

### intersection

In [49]:
# Готовим item-векторы (уже должен быть сформирован df_meta_sample)
item_embeddings = df_meta_sample.copy()

In [50]:
# Преобразуем в матрицу float32
item_matrix = np.ascontiguousarray(item_embeddings.values.astype("float32"))
item_ids_all = item_embeddings.index.tolist()

In [51]:
# Нормализуем item-векторы
faiss.normalize_L2(item_matrix)

In [52]:
# Создаем FAISS index
index = faiss.IndexFlatIP(item_matrix.shape[1])
index.add(item_matrix)

In [53]:
# Словарь: user_id → список положительных item_id
positive_items_by_user = (
    df_train_interaction_merged[df_train_interaction_merged["label"] == 1]
    .groupby("user_id")["item_id"]
    .apply(set)
    .to_dict()
)

In [54]:
# Выбираем только пользователей с положительными примерами
selected_users = list(positive_users_interaction)

In [55]:
# Генерация хард-негативов
hard_negatives_interaction = []

In [56]:
for user_id in tqdm(selected_users, desc="Generating hard negatives (FAISS-interaction)", file=sys.stdout):
    pos_items = positive_items_by_user.get(user_id, set())
    
    if len(pos_items) == 0:
        continue

    try:
        pos_vectors = item_embeddings.loc[list(pos_items)].values.astype("float32")
    except KeyError:
        continue

    user_vec = np.mean(pos_vectors, axis=0).reshape(1, -1)
    faiss.normalize_L2(user_vec)

    D, I = index.search(user_vec, 100)

    count = 0
    for idx in I[0]:
        candidate_item = item_ids_all[idx]
        if candidate_item not in pos_items:
            hard_negatives_interaction.append({
                "user_id": user_id,
                "item_id": candidate_item,
                "label": 0
            })
            count += 1
        if count >= n_neg:
            break

Generating hard negatives (FAISS-interaction): 100%|██████████| 611334/611334 [1:13:23<00:00, 138.83it/s]


In [57]:
# Финализируем датафрейм
df_hard_negatives_interaction = pd.DataFrame(hard_negatives_interaction)

In [58]:
# Выбираем позитивы из interaction-merged
df_positive_interaction = df_train_interaction_merged[df_train_interaction_merged["label"] == 1][["user_id", "item_id", "label"]].copy()

In [59]:
# Объединяем
df_train_interaction_hard = pd.concat([df_positive_interaction, df_hard_negatives_interaction], ignore_index=True)

In [60]:
print(f"Финальный датасет (interaction-based): {df_train_interaction_hard.shape}")
print(df_train_interaction_hard['label'].value_counts())

Финальный датасет (interaction-based): (3469091, 3)
label
0    1834002
1    1635089
Name: count, dtype: int64


In [61]:
df_train_interaction_hard.to_csv("data/df_train_interaction_hard.csv", index=False)

## Contrustive learning InfoNCE

In [167]:
df_train_rating_hard = pd.read_csv("data/df_train_rating_hard.csv")
df_train_interaction_hard = pd.read_csv("data/df_train_interaction_hard.csv")

In [168]:
import torch

In [169]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [170]:
# Установка детерминированного поведения для воспроизводимости результатов
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [171]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [172]:
class ContrastiveTripletDataset(torch.utils.data.Dataset):
    def __init__(self, df, df_meta_sample):
        self.df = df
        self.df_meta_sample = df_meta_sample

        # item_id → vector
        self.item_vectors = df_meta_sample.copy()

        # user_id → список положительных item_id
        self.user_pos_items = (
            df[df["label"] == 1]
            .groupby("user_id")["item_id"]
            .apply(list)
            .to_dict()
        )

        # user_id → список отрицательных item_id
        self.user_neg_items = (
            df[df["label"] == 0]
            .groupby("user_id")["item_id"]
            .apply(list)
            .to_dict()
        )

        self.user_ids = [u for u in self.user_pos_items if u in self.user_neg_items]

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]

        pos_items = self.user_pos_items[user_id]
        neg_items = self.user_neg_items[user_id]

        # Выбираем положительный и отрицательный item_id
        pos_item_id = random.choice(pos_items)
        neg_item_id = random.choice(neg_items)

        # Строим user_vec как среднее по всем положительным
        user_vec = np.mean([
            self.item_vectors.loc[i].values for i in pos_items if i in self.item_vectors.index
        ], axis=0)

        pos_vec = self.item_vectors.loc[pos_item_id].values
        neg_vec = self.item_vectors.loc[neg_item_id].values

        return (
            torch.tensor(user_vec, dtype=torch.float32),
            torch.tensor(pos_vec, dtype=torch.float32),
            torch.tensor(neg_vec, dtype=torch.float32),
        )

In [173]:
triplet_dataset_rating = ContrastiveTripletDataset(df_train_rating_hard, df_meta_sample)
triplet_dataset_interaction = ContrastiveTripletDataset(df_train_interaction_hard, df_meta_sample)

In [174]:
triplet_loader_rating = DataLoader(triplet_dataset_rating, batch_size=2048, shuffle=True)
triplet_loader_interaction = DataLoader(triplet_dataset_interaction, batch_size=2048, shuffle=True)

In [175]:
class ContrastiveDotModel(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.user_projection = nn.Linear(embedding_dim, embedding_dim)
        self.item_projection = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, user_vecs, item_vecs):
        user_proj = nn.functional.normalize(self.user_projection(user_vecs), dim=1)
        item_proj = nn.functional.normalize(self.item_projection(item_vecs), dim=1)
        return user_proj, item_proj

In [176]:
def info_nce_loss(user_proj, pos_proj, neg_proj=None, temperature=0.07):
    # user_proj: (B, D)
    # pos_proj: (B, D)
    # Негативы берём из батча: все pos других юзеров

    # Compute logits: (B, B)
    logits = torch.matmul(user_proj, pos_proj.T) / temperature

    # Labels: i-й пользователь должен совпадать с i-м положительным
    labels = torch.arange(logits.size(0)).to(logits.device)

    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(logits, labels)

In [177]:
def train_contrastive_model(dataloader, embedding_dim, device, epochs=10, lr=1e-3):
    model = ContrastiveDotModel(embedding_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        for user_vecs, pos_vecs, neg_vecs in loop:
            user_vecs = user_vecs.to(device)
            pos_vecs = pos_vecs.to(device)

            user_proj, pos_proj = model(user_vecs, pos_vecs)
            loss = info_nce_loss(user_proj, pos_proj, temperature=0.07)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")

    return model

In [178]:
embedding_dim = df_meta_sample.shape[1]

In [179]:
model_contrastive_rating = train_contrastive_model(
    triplet_loader_rating,
    embedding_dim=embedding_dim,
    device=device,
    epochs=20,
    lr=1e-3
)

                                                                         

Epoch 1: Avg Loss = 1.3520


                                                                         

Epoch 2: Avg Loss = 0.7735


                                                                         

Epoch 3: Avg Loss = 0.7510


                                                                         

Epoch 4: Avg Loss = 0.7405


                                                                         

Epoch 5: Avg Loss = 0.7361


                                                                         

Epoch 6: Avg Loss = 0.7327


                                                                         

Epoch 7: Avg Loss = 0.7299


                                                                         

Epoch 8: Avg Loss = 0.7308


                                                                         

Epoch 9: Avg Loss = 0.7272


                                                                          

Epoch 10: Avg Loss = 0.7275


                                                                          

Epoch 11: Avg Loss = 0.7249


                                                                          

Epoch 12: Avg Loss = 0.7253


                                                                          

Epoch 13: Avg Loss = 0.7256


                                                                          

Epoch 14: Avg Loss = 0.7245


                                                                          

Epoch 15: Avg Loss = 0.7257


                                                                          

Epoch 16: Avg Loss = 0.7253


                                                                          

Epoch 17: Avg Loss = 0.7241


                                                                          

Epoch 18: Avg Loss = 0.7229


                                                                          

Epoch 19: Avg Loss = 0.7234


                                                                          

Epoch 20: Avg Loss = 0.7231




In [181]:
# Сохраняем лучшую модель на диск
torch.save(model_contrastive_rating.state_dict(), "models/contrastive_rating_best.pth")
print("Модель успешно сохранена в models/contrastive_rating_best.pth")

Модель успешно сохранена в models/contrastive_rating_best.pth


In [51]:
model_contrastive_interaction = train_contrastive_model(
    triplet_loader_interaction,
    embedding_dim=embedding_dim,
    device=device,
    epochs=20,
    lr=1e-3
)

                                                                         

Epoch 1: Avg Loss = 1.3801


                                                                         

Epoch 2: Avg Loss = 0.8250


                                                                         

Epoch 3: Avg Loss = 0.8027


                                                                         

Epoch 4: Avg Loss = 0.7909


                                                                         

Epoch 5: Avg Loss = 0.7876


                                                                         

Epoch 6: Avg Loss = 0.7850


                                                                         

Epoch 7: Avg Loss = 0.7830


                                                                         

Epoch 8: Avg Loss = 0.7822


                                                                         

Epoch 9: Avg Loss = 0.7794


                                                                          

Epoch 10: Avg Loss = 0.7787


                                                                          

Epoch 11: Avg Loss = 0.7802


                                                                          

Epoch 12: Avg Loss = 0.7798


                                                                          

Epoch 13: Avg Loss = 0.7793


                                                                          

Epoch 14: Avg Loss = 0.7796


                                                                          

Epoch 15: Avg Loss = 0.7778


                                                                          

Epoch 16: Avg Loss = 0.7776


                                                                          

Epoch 17: Avg Loss = 0.7773


                                                                          

Epoch 18: Avg Loss = 0.7779


                                                                          

Epoch 19: Avg Loss = 0.7762


                                                                          

Epoch 20: Avg Loss = 0.7773




In [52]:
# Получаем эмбеддинги всех item'ов с помощью обученной проекции
def get_projected_item_vectors(model, df_meta_sample, device):
    model.eval()
    item_ids = df_meta_sample.index.tolist()
    item_vecs = torch.tensor(df_meta_sample.values, dtype=torch.float32).to(device)

    with torch.no_grad():
        # Только item-проекция
        item_proj = model.item_projection(item_vecs)
        item_proj = nn.functional.normalize(item_proj, dim=1)

    item_proj = item_proj.cpu().numpy()
    return item_ids, item_proj

In [53]:
# Получаем item_dict заранее
candidate_ids_rating, candidate_embs_rating = get_projected_item_vectors(model_contrastive_rating, df_meta_sample, device)
item_dict_rating = dict(zip(candidate_ids_rating, candidate_embs_rating))

In [54]:
candidate_ids_interaction, candidate_embs_interaction = get_projected_item_vectors(model_contrastive_interaction, df_meta_sample, device)
item_dict_interaction = dict(zip(candidate_ids_interaction, candidate_embs_interaction))

In [55]:
# Упрощённый build_user_vectors_contrastive
def build_user_vectors_contrastive(df_hard, item_dict, ground_truth):
    user_vectors = {}
    for user_id, group in df_hard[df_hard["label"] == 1].groupby("user_id"):
        if user_id not in ground_truth:
            continue
        vectors = [item_dict[iid] for iid in group["item_id"] if iid in item_dict]
        if vectors:
            user_vectors[user_id] = np.mean(vectors, axis=0)
    return user_vectors

In [56]:
# Строим user_vecs без повторной проекции
user_vectors_rating = build_user_vectors_contrastive(df_train_rating_hard, item_dict_rating, ground_truth_rating)
user_vectors_interaction = build_user_vectors_contrastive(df_train_interaction_hard, item_dict_interaction, ground_truth_interaction)

In [57]:
import faiss

In [58]:
# FAISS-индексация и инференс
def run_faiss_inference(user_vectors, item_dict, ground_truth, top_k=10):
    # Подготовка item-векторов
    item_ids = list(item_dict.keys())
    item_matrix = np.stack([item_dict[iid] for iid in item_ids]).astype("float32")

    # FAISS index (нормализованный для cosine similarity)
    faiss.normalize_L2(item_matrix)
    index = faiss.IndexFlatIP(item_matrix.shape[1])
    index.add(item_matrix)

    preds = {}
    for user_id, user_vec in tqdm(user_vectors.items(), desc="FAISS inference"):
        user_vec = user_vec.astype("float32").reshape(1, -1)
        faiss.normalize_L2(user_vec)

        D, I = index.search(user_vec, top_k)
        top_items = [item_ids[i] for i in I[0]]
        preds[user_id] = top_items

    return preds

In [59]:
# Расчёт метрик
def precision_at_k(preds, ground_truth, k=10):
    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        hits = sum([1 for item in pred_items[:k] if item in ground_truth[user]])
        scores.append(hits / k)
    return round(np.mean(scores), 4)

def apk(pred, actual, k=10):
    if not actual:
        return 0.0
    pred = pred[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(pred):
        if p in actual and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def map_at_k(preds, ground_truth, k=10):
    return round(np.mean([
        apk(preds[u], ground_truth[u], k)
        for u in preds if u in ground_truth
    ]), 4)

In [60]:
# Запуск инференса и оценка
# Rating-based
preds_rating = run_faiss_inference(user_vectors_rating, item_dict_rating, ground_truth_rating, top_k=10)
metrics_rating = {
    "precision@5": precision_at_k(preds_rating, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating, ground_truth_rating, k=10),
}
print("\n[Rating-based Contrastive Model]")
for k, v in metrics_rating.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 72249/72249 [08:18<00:00, 144.82it/s]



[Rating-based Contrastive Model]
precision@5: 0.004
map@5: 0.0066
precision@10: 0.003
map@10: 0.0075


In [61]:
# Interaction-based
preds_interaction = run_faiss_inference(user_vectors_interaction, item_dict_interaction, ground_truth_interaction, top_k=10)
metrics_interaction = {
    "precision@5": precision_at_k(preds_interaction, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction, ground_truth_interaction, k=10),
}
print("\n[Interaction-based Contrastive Model]")
for k, v in metrics_interaction.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 94443/94443 [10:52<00:00, 144.82it/s]



[Interaction-based Contrastive Model]
precision@5: 0.0035
map@5: 0.0056
precision@10: 0.0027
map@10: 0.0064


## Использование user projection в инференсе

In [62]:
def build_user_vectors_with_projection(df_hard, df_meta_sample, model, device, ground_truth):
    model.eval()
    user_vectors = {}

    for user_id, group in df_hard[df_hard["label"] == 1].groupby("user_id"):
        if user_id not in ground_truth:
            continue

        # Сырые item-вектора, НЕ проецированные
        item_vecs = [
            df_meta_sample.loc[iid].values
            for iid in group["item_id"]
            if iid in df_meta_sample.index
        ]
        if not item_vecs:
            continue

        user_vec = np.mean(item_vecs, axis=0)
        user_vec_tensor = torch.tensor(user_vec, dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            user_proj = model.user_projection(user_vec_tensor)
            user_proj = nn.functional.normalize(user_proj, dim=1)

        user_vectors[user_id] = user_proj.cpu().numpy().flatten()

    return user_vectors


In [63]:
# Rating-based user projection
user_vectors_rating_proj = build_user_vectors_with_projection(
    df_train_rating_hard, df_meta_sample, model_contrastive_rating, device, ground_truth_rating
)

In [64]:
# Interaction-based user projection
user_vectors_interaction_proj = build_user_vectors_with_projection(
    df_train_interaction_hard, df_meta_sample, model_contrastive_interaction, device, ground_truth_interaction
)

In [65]:
# Rating
preds_rating_proj = run_faiss_inference(user_vectors_rating_proj, item_dict_rating, ground_truth_rating)
metrics_rating_proj = {
    "precision@5": precision_at_k(preds_rating_proj, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating_proj, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating_proj, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating_proj, ground_truth_rating, k=10),
}
print("\n[Rating-based Contrastive Model w/ User Projection]")
for k, v in metrics_rating_proj.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 72249/72249 [08:15<00:00, 145.93it/s]



[Rating-based Contrastive Model w/ User Projection]
precision@5: 0.0039
map@5: 0.0063
precision@10: 0.0029
map@10: 0.0072


In [66]:
# Interaction
preds_interaction_proj = run_faiss_inference(user_vectors_interaction_proj, item_dict_interaction, ground_truth_interaction)
metrics_interaction_proj = {
    "precision@5": precision_at_k(preds_interaction_proj, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction_proj, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction_proj, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction_proj, ground_truth_interaction, k=10),
}
print("\n[Interaction-based Contrastive Model w/ User Projection]")
for k, v in metrics_interaction_proj.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 94443/94443 [10:40<00:00, 147.34it/s]



[Interaction-based Contrastive Model w/ User Projection]
precision@5: 0.0034
map@5: 0.0054
precision@10: 0.0025
map@10: 0.0062


## Contrustive learning без слоев

In [67]:
df_train_rating_hard = pd.read_csv("data/df_train_rating_hard.csv")
df_train_interaction_hard = pd.read_csv("data/df_train_interaction_hard.csv")

In [68]:
import torch

In [69]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [70]:
# Установка детерминированного поведения для воспроизводимости результатов
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [71]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [72]:
class ContrastiveTripletDataset(torch.utils.data.Dataset):
    def __init__(self, df, df_meta_sample):
        self.df = df
        self.df_meta_sample = df_meta_sample

        # item_id → vector
        self.item_vectors = df_meta_sample.copy()

        # user_id → список положительных item_id
        self.user_pos_items = (
            df[df["label"] == 1]
            .groupby("user_id")["item_id"]
            .apply(list)
            .to_dict()
        )

        # user_id → список отрицательных item_id
        self.user_neg_items = (
            df[df["label"] == 0]
            .groupby("user_id")["item_id"]
            .apply(list)
            .to_dict()
        )

        self.user_ids = [u for u in self.user_pos_items if u in self.user_neg_items]

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]

        pos_items = self.user_pos_items[user_id]
        neg_items = self.user_neg_items[user_id]

        # Выбираем положительный и отрицательный item_id
        pos_item_id = random.choice(pos_items)
        neg_item_id = random.choice(neg_items)

        # Строим user_vec как среднее по всем положительным
        user_vec = np.mean([
            self.item_vectors.loc[i].values for i in pos_items if i in self.item_vectors.index
        ], axis=0)

        pos_vec = self.item_vectors.loc[pos_item_id].values
        neg_vec = self.item_vectors.loc[neg_item_id].values

        return (
            torch.tensor(user_vec, dtype=torch.float32),
            torch.tensor(pos_vec, dtype=torch.float32),
            torch.tensor(neg_vec, dtype=torch.float32),
        )

In [73]:
triplet_dataset_rating = ContrastiveTripletDataset(df_train_rating_hard, df_meta_sample)
triplet_dataset_interaction = ContrastiveTripletDataset(df_train_interaction_hard, df_meta_sample)

In [74]:
triplet_loader_rating = DataLoader(triplet_dataset_rating, batch_size=2048, shuffle=True)
triplet_loader_interaction = DataLoader(triplet_dataset_interaction, batch_size=2048, shuffle=True)

In [75]:
class ContrastiveSimpleModel(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, user_vecs, item_vecs):
        user_proj = nn.functional.normalize(user_vecs, dim=1)
        item_proj = nn.functional.normalize(item_vecs, dim=1)
        return user_proj, item_proj

In [76]:
def info_nce_loss(user_proj, pos_proj, neg_proj=None, temperature=0.07):
    # user_proj: (B, D)
    # pos_proj: (B, D)
    # Негативы берём из батча: все pos других юзеров

    # Compute logits: (B, B)
    logits = torch.matmul(user_proj, pos_proj.T) / temperature

    # Labels: i-й пользователь должен совпадать с i-м положительным
    labels = torch.arange(logits.size(0)).to(logits.device)

    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(logits, labels)

In [80]:
def train_contrastive_model(dataloader, embedding_dim, device, epochs=10, lr=1e-3):
    model = ContrastiveSimpleModel().to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        for user_vecs, pos_vecs, neg_vecs in loop:
            user_vecs = user_vecs.to(device)
            pos_vecs = pos_vecs.to(device)

            user_proj, pos_proj = model(user_vecs, pos_vecs)
            loss = info_nce_loss(user_proj, pos_proj, temperature=0.07)

            # ↓ Без градиентов и оптимизации
            # optimizer.zero_grad()
            # loss.backward()
            # optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")

    return model

In [81]:
embedding_dim = df_meta_sample.shape[1]

In [82]:
model_contrastive_rating = train_contrastive_model(
    triplet_loader_rating,
    embedding_dim=embedding_dim,
    device=device,
    epochs=1,
    lr=1e-3
)

                                                                        

Epoch 1: Avg Loss = 1.0443




In [83]:
model_contrastive_interaction = train_contrastive_model(
    triplet_loader_interaction,
    embedding_dim=embedding_dim,
    device=device,
    epochs=1,
    lr=1e-3
)

                                                                        

Epoch 1: Avg Loss = 1.1305




In [84]:
# Получаем эмбеддинги всех item'ов с помощью обученной проекции
def get_projected_item_vectors(df_meta_sample, device):
    item_ids = df_meta_sample.index.tolist()
    item_vecs = torch.tensor(df_meta_sample.values, dtype=torch.float32).to(device)

    with torch.no_grad():
        item_proj = nn.functional.normalize(item_vecs, dim=1)

    item_proj = item_proj.cpu().numpy()
    return item_ids, item_proj

In [86]:
# Получаем item_dict заранее
candidate_ids_rating, candidate_embs_rating = get_projected_item_vectors(df_meta_sample, device)
item_dict_rating = dict(zip(candidate_ids_rating, candidate_embs_rating))

In [87]:
candidate_ids_interaction, candidate_embs_interaction = get_projected_item_vectors(df_meta_sample, device)
item_dict_interaction = dict(zip(candidate_ids_interaction, candidate_embs_interaction))

In [88]:
# Упрощённый build_user_vectors_contrastive
def build_user_vectors_contrastive(df_hard, item_dict, ground_truth):
    user_vectors = {}
    for user_id, group in df_hard[df_hard["label"] == 1].groupby("user_id"):
        if user_id not in ground_truth:
            continue
        vectors = [item_dict[iid] for iid in group["item_id"] if iid in item_dict]
        if vectors:
            user_vectors[user_id] = np.mean(vectors, axis=0)
    return user_vectors

In [89]:
# Строим user_vecs без повторной проекции
user_vectors_rating = build_user_vectors_contrastive(df_train_rating_hard, item_dict_rating, ground_truth_rating)
user_vectors_interaction = build_user_vectors_contrastive(df_train_interaction_hard, item_dict_interaction, ground_truth_interaction)

In [90]:
import faiss

In [91]:
# FAISS-индексация и инференс
def run_faiss_inference(user_vectors, item_dict, ground_truth, top_k=10):
    # Подготовка item-векторов
    item_ids = list(item_dict.keys())
    item_matrix = np.stack([item_dict[iid] for iid in item_ids]).astype("float32")

    # FAISS index (нормализованный для cosine similarity)
    faiss.normalize_L2(item_matrix)
    index = faiss.IndexFlatIP(item_matrix.shape[1])
    index.add(item_matrix)

    preds = {}
    for user_id, user_vec in tqdm(user_vectors.items(), desc="FAISS inference"):
        user_vec = user_vec.astype("float32").reshape(1, -1)
        faiss.normalize_L2(user_vec)

        D, I = index.search(user_vec, top_k)
        top_items = [item_ids[i] for i in I[0]]
        preds[user_id] = top_items

    return preds

In [92]:
# Расчёт метрик
def precision_at_k(preds, ground_truth, k=10):
    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        hits = sum([1 for item in pred_items[:k] if item in ground_truth[user]])
        scores.append(hits / k)
    return round(np.mean(scores), 4)

def apk(pred, actual, k=10):
    if not actual:
        return 0.0
    pred = pred[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(pred):
        if p in actual and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def map_at_k(preds, ground_truth, k=10):
    return round(np.mean([
        apk(preds[u], ground_truth[u], k)
        for u in preds if u in ground_truth
    ]), 4)

In [93]:
# Запуск инференса и оценка
# Rating-based
preds_rating = run_faiss_inference(user_vectors_rating, item_dict_rating, ground_truth_rating, top_k=10)
metrics_rating = {
    "precision@5": precision_at_k(preds_rating, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating, ground_truth_rating, k=10),
}
print("\n[Rating-based Contrastive Model]")
for k, v in metrics_rating.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 72249/72249 [08:00<00:00, 150.21it/s]



[Rating-based Contrastive Model]
precision@5: 0.0039
map@5: 0.0065
precision@10: 0.0028
map@10: 0.0073


In [94]:
# Interaction-based
preds_interaction = run_faiss_inference(user_vectors_interaction, item_dict_interaction, ground_truth_interaction, top_k=10)
metrics_interaction = {
    "precision@5": precision_at_k(preds_interaction, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction, ground_truth_interaction, k=10),
}
print("\n[Interaction-based Contrastive Model]")
for k, v in metrics_interaction.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 94443/94443 [10:25<00:00, 150.89it/s]



[Interaction-based Contrastive Model]
precision@5: 0.0034
map@5: 0.0056
precision@10: 0.0025
map@10: 0.0062


## Использование user projection в инференсе

In [95]:
def build_user_vectors_with_projection(df_hard, df_meta_sample, ground_truth):
    user_vectors = {}

    for user_id, group in df_hard[df_hard["label"] == 1].groupby("user_id"):
        if user_id not in ground_truth:
            continue

        item_vecs = [
            df_meta_sample.loc[iid].values
            for iid in group["item_id"]
            if iid in df_meta_sample.index
        ]
        if not item_vecs:
            continue

        user_vec = np.mean(item_vecs, axis=0)
        user_vec = user_vec / np.linalg.norm(user_vec)  # L2-нормализация

        user_vectors[user_id] = user_vec

    return user_vectors


In [96]:
# Rating-based user projection
user_vectors_rating_proj = build_user_vectors_with_projection(
    df_train_rating_hard, df_meta_sample, ground_truth_rating
)

In [97]:
# Interaction-based user projection
user_vectors_interaction_proj = build_user_vectors_with_projection(
    df_train_interaction_hard, df_meta_sample, ground_truth_interaction
)

In [98]:
# Rating
preds_rating_proj = run_faiss_inference(user_vectors_rating_proj, item_dict_rating, ground_truth_rating)
metrics_rating_proj = {
    "precision@5": precision_at_k(preds_rating_proj, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating_proj, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating_proj, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating_proj, ground_truth_rating, k=10),
}
print("\n[Rating-based Contrastive Model w/ User Projection]")
for k, v in metrics_rating_proj.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 72249/72249 [07:54<00:00, 152.28it/s]



[Rating-based Contrastive Model w/ User Projection]
precision@5: 0.0039
map@5: 0.0064
precision@10: 0.0028
map@10: 0.0072


In [99]:
# Interaction
preds_interaction_proj = run_faiss_inference(user_vectors_interaction_proj, item_dict_interaction, ground_truth_interaction)
metrics_interaction_proj = {
    "precision@5": precision_at_k(preds_interaction_proj, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction_proj, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction_proj, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction_proj, ground_truth_interaction, k=10),
}
print("\n[Interaction-based Contrastive Model w/ User Projection]")
for k, v in metrics_interaction_proj.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 94443/94443 [10:32<00:00, 149.37it/s]



[Interaction-based Contrastive Model w/ User Projection]
precision@5: 0.0034
map@5: 0.0056
precision@10: 0.0025
map@10: 0.0062


## Contrustive learning TripletLoss

In [100]:
df_train_rating_hard = pd.read_csv("data/df_train_rating_hard.csv")
df_train_interaction_hard = pd.read_csv("data/df_train_interaction_hard.csv")

In [101]:
import torch

In [102]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [103]:
# Установка детерминированного поведения для воспроизводимости результатов
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [104]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [105]:
class ContrastiveTripletDataset(torch.utils.data.Dataset):
    def __init__(self, df, df_meta_sample):
        self.df = df
        self.df_meta_sample = df_meta_sample

        # item_id → vector
        self.item_vectors = df_meta_sample.copy()

        # user_id → список положительных item_id
        self.user_pos_items = (
            df[df["label"] == 1]
            .groupby("user_id")["item_id"]
            .apply(list)
            .to_dict()
        )

        # user_id → список отрицательных item_id
        self.user_neg_items = (
            df[df["label"] == 0]
            .groupby("user_id")["item_id"]
            .apply(list)
            .to_dict()
        )

        self.user_ids = [u for u in self.user_pos_items if u in self.user_neg_items]

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]

        pos_items = self.user_pos_items[user_id]
        neg_items = self.user_neg_items[user_id]

        # Выбираем положительный и отрицательный item_id
        pos_item_id = random.choice(pos_items)
        neg_item_id = random.choice(neg_items)

        # Строим user_vec как среднее по всем положительным
        user_vec = np.mean([
            self.item_vectors.loc[i].values for i in pos_items if i in self.item_vectors.index
        ], axis=0)

        pos_vec = self.item_vectors.loc[pos_item_id].values
        neg_vec = self.item_vectors.loc[neg_item_id].values

        return (
            torch.tensor(user_vec, dtype=torch.float32),
            torch.tensor(pos_vec, dtype=torch.float32),
            torch.tensor(neg_vec, dtype=torch.float32),
        )

In [106]:
triplet_dataset_rating = ContrastiveTripletDataset(df_train_rating_hard, df_meta_sample)
triplet_dataset_interaction = ContrastiveTripletDataset(df_train_interaction_hard, df_meta_sample)

In [107]:
triplet_loader_rating = DataLoader(triplet_dataset_rating, batch_size=2048, shuffle=True)
triplet_loader_interaction = DataLoader(triplet_dataset_interaction, batch_size=2048, shuffle=True)

In [108]:
class TripletDotModel(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.user_projection = nn.Linear(embedding_dim, embedding_dim)
        self.item_projection = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, user_vecs, pos_vecs, neg_vecs):
        user_proj = nn.functional.normalize(self.user_projection(user_vecs), dim=1)
        pos_proj = nn.functional.normalize(self.item_projection(pos_vecs), dim=1)
        neg_proj = nn.functional.normalize(self.item_projection(neg_vecs), dim=1)
        return user_proj, pos_proj, neg_proj

In [109]:
def triplet_loss(user_proj, pos_proj, neg_proj, margin=0.2):
    pos_scores = (user_proj * pos_proj).sum(dim=1)
    neg_scores = (user_proj * neg_proj).sum(dim=1)
    losses = torch.relu(neg_scores - pos_scores + margin)
    return losses.mean()

In [110]:
def train_triplet_model(dataloader, embedding_dim, device, epochs=10, lr=1e-3):
    model = TripletDotModel(embedding_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        for user_vecs, pos_vecs, neg_vecs in loop:
            user_vecs = user_vecs.to(device)
            pos_vecs = pos_vecs.to(device)
            neg_vecs = neg_vecs.to(device)

            user_proj, pos_proj, neg_proj = model(user_vecs, pos_vecs, neg_vecs)
            loss = triplet_loss(user_proj, pos_proj, neg_proj)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")

    return model

In [111]:
embedding_dim = df_meta_sample.shape[1]

In [112]:
model_triplet_rating = train_triplet_model(
    triplet_loader_rating,
    embedding_dim=embedding_dim,
    device=device,
    epochs=20,
    lr=1e-3
)

                                                                          

Epoch 1: Avg Loss = 0.0532


                                                                          

Epoch 2: Avg Loss = 0.0250


                                                                          

Epoch 3: Avg Loss = 0.0227


                                                                          

Epoch 4: Avg Loss = 0.0218


                                                                          

Epoch 5: Avg Loss = 0.0213


                                                                          

Epoch 6: Avg Loss = 0.0212


                                                                          

Epoch 7: Avg Loss = 0.0210


                                                                          

Epoch 8: Avg Loss = 0.0209


                                                                          

Epoch 9: Avg Loss = 0.0208


                                                                           

Epoch 10: Avg Loss = 0.0208


                                                                           

Epoch 11: Avg Loss = 0.0207


                                                                           

Epoch 12: Avg Loss = 0.0206


                                                                           

Epoch 13: Avg Loss = 0.0207


                                                                           

Epoch 14: Avg Loss = 0.0206


                                                                           

Epoch 15: Avg Loss = 0.0205


                                                                           

Epoch 16: Avg Loss = 0.0205


                                                                           

Epoch 17: Avg Loss = 0.0205


                                                                           

Epoch 18: Avg Loss = 0.0203


                                                                           

Epoch 19: Avg Loss = 0.0204


                                                                           

Epoch 20: Avg Loss = 0.0203




In [113]:
model_triplet_interaction = train_triplet_model(
    triplet_loader_interaction,
    embedding_dim=embedding_dim,
    device=device,
    epochs=20,
    lr=1e-3
)

                                                                          

Epoch 1: Avg Loss = 0.0537


                                                                          

Epoch 2: Avg Loss = 0.0266


                                                                          

Epoch 3: Avg Loss = 0.0244


                                                                          

Epoch 4: Avg Loss = 0.0238


                                                                          

Epoch 5: Avg Loss = 0.0234


                                                                          

Epoch 6: Avg Loss = 0.0232


                                                                          

Epoch 7: Avg Loss = 0.0229


                                                                          

Epoch 8: Avg Loss = 0.0229


                                                                          

Epoch 9: Avg Loss = 0.0228


                                                                           

Epoch 10: Avg Loss = 0.0228


                                                                           

Epoch 11: Avg Loss = 0.0226


                                                                           

Epoch 12: Avg Loss = 0.0225


                                                                           

Epoch 13: Avg Loss = 0.0225


                                                                           

Epoch 14: Avg Loss = 0.0225


                                                                           

Epoch 15: Avg Loss = 0.0224


                                                                           

Epoch 16: Avg Loss = 0.0227


                                                                           

Epoch 17: Avg Loss = 0.0225


                                                                           

Epoch 18: Avg Loss = 0.0225


                                                                           

Epoch 19: Avg Loss = 0.0224


                                                                           

Epoch 20: Avg Loss = 0.0223




In [114]:
# Получаем эмбеддинги всех item'ов с помощью обученной проекции
def get_projected_item_vectors(model, df_meta_sample, device):
    model.eval()
    item_ids = df_meta_sample.index.tolist()
    item_vecs = torch.tensor(df_meta_sample.values, dtype=torch.float32).to(device)

    with torch.no_grad():
        # Только item-проекция
        item_proj = model.item_projection(item_vecs)
        item_proj = nn.functional.normalize(item_proj, dim=1)

    item_proj = item_proj.cpu().numpy()
    return item_ids, item_proj

In [115]:
# Получаем item_dict заранее
candidate_ids_rating, candidate_embs_rating = get_projected_item_vectors(model_triplet_rating, df_meta_sample, device)
item_dict_rating = dict(zip(candidate_ids_rating, candidate_embs_rating))

In [116]:
candidate_ids_interaction, candidate_embs_interaction = get_projected_item_vectors(model_triplet_interaction, df_meta_sample, device)
item_dict_interaction = dict(zip(candidate_ids_interaction, candidate_embs_interaction))

In [117]:
# Упрощённый build_user_vectors_contrastive
def build_user_vectors_contrastive(df_hard, item_dict, ground_truth):
    user_vectors = {}
    for user_id, group in df_hard[df_hard["label"] == 1].groupby("user_id"):
        if user_id not in ground_truth:
            continue
        vectors = [item_dict[iid] for iid in group["item_id"] if iid in item_dict]
        if vectors:
            user_vectors[user_id] = np.mean(vectors, axis=0)
    return user_vectors

In [118]:
# Строим user_vecs без повторной проекции
user_vectors_rating = build_user_vectors_contrastive(df_train_rating_hard, item_dict_rating, ground_truth_rating)
user_vectors_interaction = build_user_vectors_contrastive(df_train_interaction_hard, item_dict_interaction, ground_truth_interaction)

In [119]:
import faiss

In [120]:
# FAISS-индексация и инференс
def run_faiss_inference(user_vectors, item_dict, ground_truth, top_k=10):
    # Подготовка item-векторов
    item_ids = list(item_dict.keys())
    item_matrix = np.stack([item_dict[iid] for iid in item_ids]).astype("float32")

    # FAISS index (нормализованный для cosine similarity)
    faiss.normalize_L2(item_matrix)
    index = faiss.IndexFlatIP(item_matrix.shape[1])
    index.add(item_matrix)

    preds = {}
    for user_id, user_vec in tqdm(user_vectors.items(), desc="FAISS inference"):
        user_vec = user_vec.astype("float32").reshape(1, -1)
        faiss.normalize_L2(user_vec)

        D, I = index.search(user_vec, top_k)
        top_items = [item_ids[i] for i in I[0]]
        preds[user_id] = top_items

    return preds

In [121]:
# Расчёт метрик
def precision_at_k(preds, ground_truth, k=10):
    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        hits = sum([1 for item in pred_items[:k] if item in ground_truth[user]])
        scores.append(hits / k)
    return round(np.mean(scores), 4)

def apk(pred, actual, k=10):
    if not actual:
        return 0.0
    pred = pred[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(pred):
        if p in actual and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def map_at_k(preds, ground_truth, k=10):
    return round(np.mean([
        apk(preds[u], ground_truth[u], k)
        for u in preds if u in ground_truth
    ]), 4)

In [122]:
# Запуск инференса и оценка
# Rating-based
preds_rating = run_faiss_inference(user_vectors_rating, item_dict_rating, ground_truth_rating, top_k=10)
metrics_rating = {
    "precision@5": precision_at_k(preds_rating, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating, ground_truth_rating, k=10),
}
print("\n[Rating-based Triplet Loss Model]")
for k, v in metrics_rating.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 72249/72249 [08:08<00:00, 147.98it/s]



[Rating-based Triplet Loss Model]
precision@5: 0.0024
map@5: 0.0046
precision@10: 0.0015
map@10: 0.005


In [123]:
# Interaction-based
preds_interaction = run_faiss_inference(user_vectors_interaction, item_dict_interaction, ground_truth_interaction, top_k=10)
metrics_interaction = {
    "precision@5": precision_at_k(preds_interaction, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction, ground_truth_interaction, k=10),
}
print("\n[Interaction-based Triplet Loss Model]")
for k, v in metrics_interaction.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 94443/94443 [10:24<00:00, 151.24it/s]



[Interaction-based Triplet Loss Model]
precision@5: 0.0021
map@5: 0.004
precision@10: 0.0014
map@10: 0.0042


## Использование user projection в инференсе

In [124]:
def build_user_vectors_with_projection(df_hard, df_meta_sample, model, device, ground_truth):
    model.eval()
    user_vectors = {}

    for user_id, group in df_hard[df_hard["label"] == 1].groupby("user_id"):
        if user_id not in ground_truth:
            continue

        # Сырые item-вектора, НЕ проецированные
        item_vecs = [
            df_meta_sample.loc[iid].values
            for iid in group["item_id"]
            if iid in df_meta_sample.index
        ]
        if not item_vecs:
            continue

        user_vec = np.mean(item_vecs, axis=0)
        user_vec_tensor = torch.tensor(user_vec, dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            user_proj = model.user_projection(user_vec_tensor)
            user_proj = nn.functional.normalize(user_proj, dim=1)

        user_vectors[user_id] = user_proj.cpu().numpy().flatten()

    return user_vectors


In [125]:
# Rating-based user projection
user_vectors_rating_proj = build_user_vectors_with_projection(
    df_train_rating_hard, df_meta_sample, model_triplet_rating, device, ground_truth_rating
)

In [126]:
# Interaction-based user projection
user_vectors_interaction_proj = build_user_vectors_with_projection(
    df_train_interaction_hard, df_meta_sample, model_triplet_interaction, device, ground_truth_interaction
)

In [127]:
# Rating
preds_rating_proj = run_faiss_inference(user_vectors_rating_proj, item_dict_rating, ground_truth_rating)
metrics_rating_proj = {
    "precision@5": precision_at_k(preds_rating_proj, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating_proj, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating_proj, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating_proj, ground_truth_rating, k=10),
}
print("\n[Rating-based Triplet Loss Model w/ User Projection]")
for k, v in metrics_rating_proj.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 72249/72249 [07:57<00:00, 151.30it/s]



[Rating-based Triplet Loss Model w/ User Projection]
precision@5: 0.0016
map@5: 0.0034
precision@10: 0.0009
map@10: 0.0035


In [128]:
# Interaction
preds_interaction_proj = run_faiss_inference(user_vectors_interaction_proj, item_dict_interaction, ground_truth_interaction)
metrics_interaction_proj = {
    "precision@5": precision_at_k(preds_interaction_proj, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction_proj, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction_proj, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction_proj, ground_truth_interaction, k=10),
}
print("\n[Interaction-based Triplet Loss Model w/ User Projection]")
for k, v in metrics_interaction_proj.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 94443/94443 [10:24<00:00, 151.35it/s]



[Interaction-based Triplet Loss Model w/ User Projection]
precision@5: 0.0014
map@5: 0.003
precision@10: 0.0008
map@10: 0.0031


## BPRMF

In [142]:
df_train_rating_hard = pd.read_csv("data/df_train_rating_hard.csv")
df_train_interaction_hard = pd.read_csv("data/df_train_interaction_hard.csv")

In [143]:
# Собираем уникальные пользователи и товары
unique_users = pd.concat([
    df_train_rating_hard["user_id"], df_train_interaction_hard["user_id"]
]).unique()
unique_items = pd.concat([
    df_train_rating_hard["item_id"], df_train_interaction_hard["item_id"]
]).unique()

In [144]:
# Словари для индексирования
user2idx = {uid: idx for idx, uid in enumerate(unique_users)}
item2idx = {iid: idx for idx, iid in enumerate(unique_items)}

In [145]:
# Обратные словари можно создать позже при инференсе, если потребуется
num_users = len(user2idx)
num_items = len(item2idx)

In [146]:
import torch

In [147]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [148]:
# Установка детерминированного поведения для воспроизводимости результатов
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [149]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [150]:
class BPRDataset(torch.utils.data.Dataset):
    def __init__(self, df, user2idx, item2idx):
        self.user_pos_items = (
            df[df["label"] == 1]
            .groupby("user_id")["item_id"]
            .apply(set)
            .to_dict()
        )
        self.user_neg_items = (
            df[df["label"] == 0]
            .groupby("user_id")["item_id"]
            .apply(set)
            .to_dict()
        )
        self.user_ids = [u for u in self.user_pos_items if u in self.user_neg_items]
        self.user2idx = user2idx
        self.item2idx = item2idx

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]
        pos_item = random.choice(list(self.user_pos_items[user_id]))
        neg_item = random.choice(list(self.user_neg_items[user_id]))

        return (
            torch.tensor(self.user2idx[user_id], dtype=torch.long),
            torch.tensor(self.item2idx[pos_item], dtype=torch.long),
            torch.tensor(self.item2idx[neg_item], dtype=torch.long),
        )

In [151]:
bpr_dataset_rating = BPRDataset(df_train_rating_hard, user2idx, item2idx)
bpr_dataset_interaction = BPRDataset(df_train_interaction_hard, user2idx, item2idx)

In [152]:
bpr_loader_rating = DataLoader(bpr_dataset_rating, batch_size=2048, shuffle=True)
bpr_loader_interaction = DataLoader(bpr_dataset_interaction, batch_size=2048, shuffle=True)

In [153]:
class BPRMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)

    def forward(self, user_ids, pos_item_ids, neg_item_ids):
        user_vecs = self.user_embedding(user_ids)
        pos_vecs = self.item_embedding(pos_item_ids)
        neg_vecs = self.item_embedding(neg_item_ids)

        pos_scores = (user_vecs * pos_vecs).sum(dim=1)
        neg_scores = (user_vecs * neg_vecs).sum(dim=1)
        return pos_scores, neg_scores

In [154]:
def bpr_loss(pos_scores, neg_scores):
    return -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()

In [155]:
def train_bpr_model(dataloader, num_users, num_items, device, embedding_dim=64, epochs=10, lr=1e-3):
    model = BPRMF(num_users, num_items, embedding_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loop = tqdm(dataloader, desc=f"BPRMF Epoch {epoch+1}/{epochs}", leave=False)
        for user_ids, pos_item_ids, neg_item_ids in loop:
            user_ids = user_ids.to(device)
            pos_item_ids = pos_item_ids.to(device)
            neg_item_ids = neg_item_ids.to(device)

            pos_scores, neg_scores = model(user_ids, pos_item_ids, neg_item_ids)
            loss = bpr_loss(pos_scores, neg_scores)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1}: Avg Loss = {total_loss / len(dataloader):.4f}")

    return model


In [156]:
embedding_dim = df_meta_sample.shape[1]

In [157]:
model_bpr_rating = train_bpr_model(
    bpr_loader_rating,
    num_users=len(user2idx),
    num_items=len(item2idx),
    device=device,
    embedding_dim=64,
    epochs=20,
    lr=1e-3
)

                                                                               

Epoch 1: Avg Loss = 0.6931


                                                                               

Epoch 2: Avg Loss = 0.6795


                                                                               

Epoch 3: Avg Loss = 0.5868


                                                                               

Epoch 4: Avg Loss = 0.4446


                                                                               

Epoch 5: Avg Loss = 0.3310


                                                                               

Epoch 6: Avg Loss = 0.2519


                                                                               

Epoch 7: Avg Loss = 0.1958


                                                                               

Epoch 8: Avg Loss = 0.1561


                                                                               

Epoch 9: Avg Loss = 0.1260


                                                                                 

Epoch 10: Avg Loss = 0.1041


                                                                                 

Epoch 11: Avg Loss = 0.0865


                                                                                 

Epoch 12: Avg Loss = 0.0724


                                                                                 

Epoch 13: Avg Loss = 0.0613


                                                                                 

Epoch 14: Avg Loss = 0.0520


                                                                                 

Epoch 15: Avg Loss = 0.0447


                                                                                 

Epoch 16: Avg Loss = 0.0383


                                                                                 

Epoch 17: Avg Loss = 0.0332


                                                                                 

Epoch 18: Avg Loss = 0.0288


                                                                                 

Epoch 19: Avg Loss = 0.0251


                                                                                 

Epoch 20: Avg Loss = 0.0219




In [158]:
model_bpr_interaction = train_bpr_model(
    bpr_loader_interaction,
    num_users=len(user2idx),
    num_items=len(item2idx),
    device=device,
    embedding_dim=64,
    epochs=20,
    lr=1e-3
)

                                                                               

Epoch 1: Avg Loss = 0.6931


                                                                               

Epoch 2: Avg Loss = 0.6790


                                                                               

Epoch 3: Avg Loss = 0.5837


                                                                               

Epoch 4: Avg Loss = 0.4429


                                                                               

Epoch 5: Avg Loss = 0.3315


                                                                               

Epoch 6: Avg Loss = 0.2534


                                                                               

Epoch 7: Avg Loss = 0.1978


                                                                               

Epoch 8: Avg Loss = 0.1577


                                                                               

Epoch 9: Avg Loss = 0.1279


                                                                                 

Epoch 10: Avg Loss = 0.1052


                                                                                 

Epoch 11: Avg Loss = 0.0872


                                                                                 

Epoch 12: Avg Loss = 0.0729


                                                                                 

Epoch 13: Avg Loss = 0.0617


                                                                                 

Epoch 14: Avg Loss = 0.0526


                                                                                 

Epoch 15: Avg Loss = 0.0449


                                                                                 

Epoch 16: Avg Loss = 0.0384


                                                                                 

Epoch 17: Avg Loss = 0.0332


                                                                                 

Epoch 18: Avg Loss = 0.0288


                                                                                 

Epoch 19: Avg Loss = 0.0248


                                                                                 

Epoch 20: Avg Loss = 0.0216




In [159]:
# Получение item-векторов и user-векторов из обученной BPRMF-модели
def extract_user_item_embeddings(model, device):
    model.eval()
    with torch.no_grad():
        user_embeddings = model.user_embedding.weight.detach().cpu().numpy()
        item_embeddings = model.item_embedding.weight.detach().cpu().numpy()
    return user_embeddings, item_embeddings

In [160]:
# Получаем эмбеддинги из модели
user_embs_rating, item_embs_rating = extract_user_item_embeddings(model_bpr_rating, device)

# Создаем словари id → эмбеддинг
item_dict_rating = {item: item_embs_rating[idx] for item, idx in item2idx.items()}
user_vectors_rating = {user: user_embs_rating[idx] for user, idx in user2idx.items()}


In [161]:
# Получаем эмбеддинги из модели
user_embs_interaction, item_embs_interaction = extract_user_item_embeddings(model_bpr_interaction, device)

# Создаем словари id → эмбеддинг
item_dict_interaction = {item: item_embs_interaction[idx] for item, idx in item2idx.items()}
user_vectors_interaction = {user: user_embs_interaction[idx] for user, idx in user2idx.items()}

In [162]:
import faiss

In [163]:
# FAISS-индексация и инференс
def run_faiss_inference(user_vectors, item_dict, ground_truth, top_k=10):
    # Подготовка item-векторов
    item_ids = list(item_dict.keys())
    item_matrix = np.stack([item_dict[iid] for iid in item_ids]).astype("float32")

    # FAISS index (нормализованный для cosine similarity)
    faiss.normalize_L2(item_matrix)
    index = faiss.IndexFlatIP(item_matrix.shape[1])
    index.add(item_matrix)

    preds = {}
    for user_id, user_vec in tqdm(user_vectors.items(), desc="FAISS inference"):
        user_vec = user_vec.astype("float32").reshape(1, -1)
        faiss.normalize_L2(user_vec)

        D, I = index.search(user_vec, top_k)
        top_items = [item_ids[i] for i in I[0]]
        preds[user_id] = top_items

    return preds

In [164]:
# Расчёт метрик
def precision_at_k(preds, ground_truth, k=10):
    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        hits = sum([1 for item in pred_items[:k] if item in ground_truth[user]])
        scores.append(hits / k)
    return round(np.mean(scores), 4)

def apk(pred, actual, k=10):
    if not actual:
        return 0.0
    pred = pred[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(pred):
        if p in actual and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def map_at_k(preds, ground_truth, k=10):
    return round(np.mean([
        apk(preds[u], ground_truth[u], k)
        for u in preds if u in ground_truth
    ]), 4)

In [165]:
# Запуск инференса и оценка
# Rating-based
preds_rating = run_faiss_inference(user_vectors_rating, item_dict_rating, ground_truth_rating, top_k=10)
metrics_rating = {
    "precision@5": precision_at_k(preds_rating, ground_truth_rating, k=5),
    "map@5": map_at_k(preds_rating, ground_truth_rating, k=5),
    "precision@10": precision_at_k(preds_rating, ground_truth_rating, k=10),
    "map@10": map_at_k(preds_rating, ground_truth_rating, k=10),
}
print("\n[Rating-based BPRMF Model]")
for k, v in metrics_rating.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 611334/611334 [18:30<00:00, 550.37it/s]



[Rating-based BPRMF Model]
precision@5: 0.0009
map@5: 0.0021
precision@10: 0.0005
map@10: 0.0022


In [166]:
# Interaction-based
preds_interaction = run_faiss_inference(user_vectors_interaction, item_dict_interaction, ground_truth_interaction, top_k=10)
metrics_interaction = {
    "precision@5": precision_at_k(preds_interaction, ground_truth_interaction, k=5),
    "map@5": map_at_k(preds_interaction, ground_truth_interaction, k=5),
    "precision@10": precision_at_k(preds_interaction, ground_truth_interaction, k=10),
    "map@10": map_at_k(preds_interaction, ground_truth_interaction, k=10),
}
print("\n[Interaction-based BPRMF Model]")
for k, v in metrics_interaction.items():
    print(f"{k}: {v}")

FAISS inference: 100%|██████████| 611334/611334 [18:54<00:00, 538.87it/s]



[Interaction-based BPRMF Model]
precision@5: 0.0007
map@5: 0.0019
precision@10: 0.0004
map@10: 0.0019
