In [1]:
# Импортируем основные библиотеки для работы с данными и Torch
import pandas as pd
import numpy as np
import random
import torch
from tqdm import tqdm
import sys
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import itertools

In [2]:
# Make runs reproducible by seeding every random number generator used in the notebook
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and make cuDNN deterministic.

    Args:
        seed: Integer seed passed to each generator.
    """
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Deterministic kernels on, autotuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(42)

In [3]:
# Load the test interactions (rating-based sampling); they are the source
# of the ground truth used to evaluate the recommendations
df_test = pd.read_csv("data/df_test_ground_truth_rating_based.csv")

In [4]:
# Build the ground-truth dictionary: user_id -> set of item_ids
# that appear for that user in the test data
ground_truth = df_test.groupby("user_id")["item_id"].apply(set).to_dict()

In [5]:
# Precision@K: mean share of relevant items among each user's top-K recommendations
def precision_at_k(preds, ground_truth, k=10):
    """Average Precision@K over the users that have ground truth.

    Args:
        preds: Mapping user -> ordered sequence of recommended items.
        ground_truth: Mapping user -> container of relevant items.
        k: Cut-off; each user's hit count is divided by k even when fewer
            than k items were recommended.

    Returns:
        Mean precision rounded to 4 decimals, or 0.0 when no user in
        ``preds`` has ground truth (instead of NaN plus a RuntimeWarning).

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be positive, got {k}")

    scores = []
    for user, pred_items in preds.items():
        if user not in ground_truth:
            continue
        gt_items = ground_truth[user]
        hits = sum(1 for item in pred_items[:k] if item in gt_items)
        scores.append(hits / k)

    if not scores:
        # np.mean([]) would yield NaN and poison any later max()/sorting
        return 0.0
    return round(np.mean(scores), 4)
# Average precision at K for a single user
def apk(pred, actual, k=10):
    """Average precision of the first ``k`` predictions against ``actual``.

    Repeated predictions are credited only on their first occurrence.
    Returns 0.0 when ``actual`` is empty; otherwise the accumulated
    precision is divided by ``min(len(actual), k)``.
    """
    if not actual:
        return 0.0

    top_k = pred[:k]
    hit_count = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        if candidate not in actual:
            continue
        if candidate in top_k[:rank - 1]:
            # already counted at an earlier position
            continue
        hit_count += 1.0
        precision_sum += hit_count / rank
    return precision_sum / min(len(actual), k)
# Mean average precision at K over all users that have ground truth
def map_at_k(preds, ground_truth, k=10):
    """Mean of per-user ``apk`` scores.

    Args:
        preds: Mapping user -> ordered sequence of recommended items.
        ground_truth: Mapping user -> container of relevant items.
        k: Cut-off forwarded to ``apk``.

    Returns:
        MAP@K rounded to 4 decimals, or 0.0 when no user in ``preds`` has
        ground truth (instead of NaN plus a RuntimeWarning).
    """
    ap_scores = [
        apk(items, ground_truth[user], k)
        for user, items in preds.items()
        if user in ground_truth
    ]
    if not ap_scores:
        # np.mean([]) would yield NaN and poison any later max()/sorting
        return 0.0
    return round(np.mean(ap_scores), 4)

In [6]:
# Load the training sample; the CLIP embedding columns (text + image)
# are picked out of this frame further below
df_train = pd.read_csv(
    "data/df_train_CLIP_rating_based.csv",
    na_values=[""],  # only empty strings are parsed as NaN; pandas' default NA markers are switched off
    keep_default_na=False
)

In [7]:
# Distinct users in the training sample, and the CLIP embedding columns (text + image)
user_ids = df_train["user_id"].unique()
item_vector_cols = [
    column
    for column in df_train.columns
    if column.startswith(("clip_text_", "clip_img_"))
]

In [8]:
# One embedding row per item (first occurrence wins), indexed by item_id
clip_vectors = (
    df_train.loc[:, ["item_id", *item_vector_cols]]
    .drop_duplicates("item_id")
    .set_index("item_id")
)

In [9]:
# Independent copy of the embedding table; this is the item lookup used from here on
item_vectors = clip_vectors.copy()

In [10]:
# Sanity check: both tables should report the same (items, embedding columns) shape
print("clip_vectors shape:", clip_vectors.shape)
print("item_vectors shape:", item_vectors.shape)

clip_vectors shape: (67706, 200)
item_vectors shape: (67706, 200)


In [11]:
# User interest vectors: user_id -> mean embedding of the user's positive items
# (filled by the aggregation loop that follows)
user_vectors = {}

In [12]:
# Average the embeddings of each user's positively-labelled items
for user_id, group in tqdm(
    df_train.loc[df_train["label"] == 1].groupby("user_id"),
    desc="User vector aggregation",
    file=sys.stdout,
):
    liked_vectors = item_vectors.loc[group["item_id"].to_numpy()].to_numpy()
    user_vectors[user_id] = liked_vectors.mean(axis=0)

User vector aggregation: 100%|██████████| 551853/551853 [02:27<00:00, 3743.57it/s]


In [13]:
# Accumulators for the training set: X collects concatenated user+item vectors,
# y collects the matching labels
X = []
y = []

In [14]:
# Build the training pairs: one concatenated [user_vec | item_vec] row per interaction.
# The accumulators are (re)created here so that re-running this cell neither appends
# a second copy of every pair nor fails because X / y were already turned into arrays.
X, y = [], []
for row in tqdm(df_train.itertuples(), total=len(df_train), desc="Building training pairs", file=sys.stdout):
    item_id = row.item_id
    user_id = row.user_id
    label = row.label

    # Skip pairs that lack a user vector (users with no positive interaction)
    # or an item embedding
    if user_id not in user_vectors or item_id not in item_vectors.index:
        continue

    user_vec = user_vectors[user_id]
    item_vec = item_vectors.loc[item_id].values

    X.append(np.concatenate([user_vec, item_vec]))
    y.append(label)

Building training pairs: 100%|██████████| 1635089/1635089 [01:18<00:00, 20901.68it/s]


In [15]:
# Convert the accumulated lists to arrays for PyTorch.
# float32 is requested up front: MatchingDataset casts its inputs to float32 anyway,
# so a float64 matrix would only double the memory held by X and its splits.
X = np.array(X, dtype=np.float32)
y = np.array(y)
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (1542533, 400)
y shape: (1542533,)


In [16]:
# Custom dataset that keeps the user and item vectors as separate tensors
class MatchingDataset(Dataset):
    """Triples of (user vector, item vector, label), all stored as float32 tensors."""

    def __init__(self, user_vecs, item_vecs, labels):
        def as_float32(values):
            return torch.tensor(values, dtype=torch.float32)

        self.user_vecs = as_float32(user_vecs)
        self.item_vecs = as_float32(item_vecs)
        self.labels = as_float32(labels)

    def __len__(self):
        # The number of samples is defined by the labels tensor
        return len(self.labels)

    def __getitem__(self, idx):
        fields = (self.user_vecs, self.item_vecs, self.labels)
        return tuple(field[idx] for field in fields)


In [17]:
# Split X back into its user half and its item half
# (assumes both halves have the same width, i.e. X has an even number of columns)
user_dim = item_dim = X.shape[1] // 2
X_user = X[:, :user_dim]
X_item = X[:, user_dim:]

In [18]:
# Hold out 10% of the pairs for validation, stratified by label, with a fixed seed
X_user_train, X_user_val, X_item_train, X_item_val, y_train_np, y_val_np = train_test_split(
    X_user, X_item, y, test_size=0.1, random_state=42, stratify=y
)

In [19]:
# Class-imbalance weights: n_total / (2 * n_class) for each class.
# The counts come from the full label array y (train and validation pairs together).
n_pos = np.sum(y == 1)
n_neg = np.sum(y == 0)
pos_weight = (n_neg + n_pos) / (2 * n_pos)
neg_weight = (n_neg + n_pos) / (2 * n_neg)

In [20]:
# Wrap the train / validation splits in MatchingDataset objects
train_dataset = MatchingDataset(X_user_train, X_item_train, y_train_np)
val_dataset = MatchingDataset(X_user_val, X_item_val, y_val_np)

In [21]:
# Hyper-parameter grid for the search: 2 * 3 * 3 * 2 * 3 * 2 * 2 = 432 combinations
grid_config = {
    "init_scale": [1.0, 2.0],
    "optimizer": ["Adam", "AdamW", "SGD"],
    "learning_rate": [1e-2, 1e-3, 1e-4],
    "loss_function": ["BCELoss", "FocalLoss"],
    "weighting_strategy": ["none", "balanced", "sqrt-balanced"],
    "batch_size": [1024, 2048],
    "epochs": [10, 20]
}

In [22]:
# Generate every configuration of the grid (Cartesian product of the value lists);
# each configuration is a dict keyed like grid_config
keys, values = zip(*grid_config.items())
all_reduced_configs = [dict(zip(keys, v)) for v in itertools.product(*values)]

In [23]:
# Randomly pick 108 configurations (re-seeded so the selection is reproducible)
random.seed(42)
random_sample = random.sample(all_reduced_configs, 108) # 108 of 432 = 25% of the cases

In [24]:
import torch.nn.functional as F

In [25]:
# Factory for a training routine driven by a hyper-parameter configuration
def create_train_model_fn(pos_weight, neg_weight, device, train_data=None, val_data=None):
    """Build a ``train_model(config)`` closure bound to class weights and a device.

    Args:
        pos_weight: Loss weight for samples whose label equals 1
            (Python number or 0-dim tensor).
        neg_weight: Loss weight for all other samples (number or 0-dim tensor).
        device: torch device on which the model and the batches are placed.
        train_data: Optional dataset yielding ``(user_vec, item_vec, label)``.
            When omitted, the notebook-level ``train_dataset`` is looked up at
            training time.
        val_data: Optional validation dataset of the same form. When omitted,
            the notebook-level ``val_dataset`` is used.

    Returns:
        ``train_model(config, verbose=False)``; it trains a fresh
        ``CLIPCosineModel`` and returns it. ``config`` must provide the keys
        ``init_scale``, ``optimizer``, ``learning_rate``, ``loss_function``,
        ``weighting_strategy``, ``batch_size`` and ``epochs``.

    Note:
        The model output is ``sigmoid(scale * cosine)`` with a single learnable
        ``scale``. This is monotonic in the cosine, so for any positive scale
        the item ranking is the plain cosine ranking.
    """
    # Move the class weights to the target device once instead of once per batch
    pos_w = torch.as_tensor(pos_weight, dtype=torch.float32).to(device)
    neg_w = torch.as_tensor(neg_weight, dtype=torch.float32).to(device)

    class CLIPCosineModel(nn.Module):
        """Sigmoid of a learnable scale times the user-item cosine similarity."""

        def __init__(self, init_scale):
            super().__init__()
            # float() keeps the parameter trainable even for an integer init_scale
            self.scale = nn.Parameter(torch.tensor(float(init_scale), dtype=torch.float32))

        def logits(self, user_vec, item_vec):
            """Return ``scale * cosine`` (pre-sigmoid scores) with shape (batch, 1)."""
            user_norm = F.normalize(user_vec, p=2, dim=1)
            item_norm = F.normalize(item_vec, p=2, dim=1)
            cosine = (user_norm * item_norm).sum(dim=1, keepdim=True)
            return self.scale * cosine

        def forward(self, user_vec, item_vec):
            """Return match probabilities with shape (batch, 1)."""
            return torch.sigmoid(self.logits(user_vec, item_vec))

    def _focal_loss(preds, labels, gamma=2.0):
        """Element-wise focal loss on probabilities."""
        eps = 1e-6
        p_t = preds * labels + (1 - preds) * (1 - labels)
        return -((1 - p_t) ** gamma) * torch.log(p_t + eps)

    def _build_criterion(name):
        """Return ``(criterion, wants_logits)`` for the configured loss name."""
        if name == "BCELoss":
            return nn.BCELoss(reduction="none"), False
        if name == "BCEWithLogitsLoss":
            # This loss applies the sigmoid itself, so it must get raw logits
            return nn.BCEWithLogitsLoss(reduction="none"), True
        if name == "FocalLoss":
            return _focal_loss, False
        raise ValueError(f"Unknown loss_function: {name!r}")

    def _build_optimizer(name, params, lr):
        """Instantiate the configured optimizer."""
        optimizer_classes = {
            "Adam": torch.optim.Adam,
            "AdamW": torch.optim.AdamW,
            "SGD": torch.optim.SGD,
        }
        if name not in optimizer_classes:
            raise ValueError(f"Unknown optimizer: {name!r}")
        return optimizer_classes[name](params, lr=lr)

    def _class_weights(strategy):
        """Return ``(positive_weight, negative_weight)`` or None for no weighting."""
        if strategy == "none":
            return None
        if strategy == "balanced":
            return pos_w, neg_w
        if strategy == "sqrt-balanced":
            return torch.sqrt(pos_w), torch.sqrt(neg_w)
        raise ValueError(f"Unknown weighting_strategy: {strategy!r}")

    def _weighted_loss(model, criterion, wants_logits, class_w, u_vec, i_vec, labels):
        """Return the (optionally class-weighted) mean batch loss and the probabilities."""
        raw_scores = model.logits(u_vec, i_vec)
        preds = torch.sigmoid(raw_scores)
        loss_raw = criterion(raw_scores if wants_logits else preds, labels)
        if class_w is not None:
            loss_raw = loss_raw * torch.where(labels == 1, class_w[0], class_w[1])
        return loss_raw.mean(), preds

    def train_model(config, verbose=False):
        """Train one model for ``config``; print per-epoch metrics when ``verbose``."""
        train_set = train_dataset if train_data is None else train_data
        val_set = val_dataset if val_data is None else val_data

        # Data loaders with the configured batch size
        train_loader = DataLoader(train_set, batch_size=config["batch_size"], shuffle=True)
        val_loader = DataLoader(val_set, batch_size=config["batch_size"])

        model = CLIPCosineModel(config["init_scale"]).to(device)
        criterion, wants_logits = _build_criterion(config["loss_function"])
        optimizer = _build_optimizer(
            config["optimizer"], model.parameters(), config["learning_rate"]
        )
        class_w = _class_weights(config["weighting_strategy"])

        for epoch in range(config["epochs"]):
            # Training pass
            model.train()
            train_losses = []
            for u_vec, i_vec, labels in train_loader:
                u_vec = u_vec.to(device)
                i_vec = i_vec.to(device)
                labels = labels.to(device).view(-1, 1)

                loss, _ = _weighted_loss(
                    model, criterion, wants_logits, class_w, u_vec, i_vec, labels
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                train_losses.append(loss.item())

            # Validation pass; it runs regardless of `verbose` so that switching
            # the logging on or off cannot change what the training loop does
            model.eval()
            val_losses = []
            n_correct = 0
            n_seen = 0
            with torch.no_grad():
                for u_vec, i_vec, labels in val_loader:
                    u_vec = u_vec.to(device)
                    i_vec = i_vec.to(device)
                    labels = labels.to(device).view(-1, 1)

                    loss, preds = _weighted_loss(
                        model, criterion, wants_logits, class_w, u_vec, i_vec, labels
                    )
                    val_losses.append(loss.item())
                    n_correct += ((preds >= 0.5) == (labels == 1)).sum().item()
                    n_seen += labels.numel()

            if verbose:
                train_loss = sum(train_losses) / max(len(train_losses), 1)
                val_loss = sum(val_losses) / max(len(val_losses), 1)
                val_acc = n_correct / max(n_seen, 1)
                print(f"Epoch {epoch + 1}/{config['epochs']} | "
                      f"Train Loss: {train_loss:.4f} | "
                      f"Val Loss: {val_loss:.4f} | "
                      f"Val Accuracy: {val_acc:.4f}")

        return model

    return train_model


In [26]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
# Bind the class weights (as float32 tensors) and the device into the training function
train_model_fn = create_train_model_fn(
    pos_weight=torch.tensor(pos_weight, dtype=torch.float32),
    neg_weight=torch.tensor(neg_weight, dtype=torch.float32),
    device=device
)

In [28]:
# Holds one (model, config, predictions_dict) tuple per trained configuration
model_list = []

In [29]:
# Train one model per sampled configuration.
# The list is rebuilt from scratch so that re-running this cell replaces earlier
# results instead of appending duplicates. The third tuple slot is the per-user
# predictions dict that gets filled during inference.
model_list = []
for config in tqdm(random_sample, desc="Training models"):
    model = train_model_fn(config)
    model.eval()
    model_list.append((model, config, {}))

Training models: 100%|██████████| 108/108 [9:31:18<00:00, 317.40s/it] 


In [30]:
# Candidate item ids and their embeddings (same row order) used at inference time
candidate_items = item_vectors.index.tolist()
candidate_vectors = item_vectors.values

In [31]:
# Build the top-10 recommendation list for every evaluated user and every model.
# Work that does not depend on the user or the model (candidate tensor, candidate
# id array) is done once, outside the loops.
# NOTE(review): users without a training-based vector are skipped, so they are also
# absent from the metrics computed afterwards — confirm this is intended.
# NOTE(review): items a user already interacted with in training are not removed
# from the candidates — confirm whether they should be filtered out.
top_n = 10
candidate_ids = np.array(candidate_items)
item_tensor = torch.tensor(candidate_vectors, dtype=torch.float32).to(device)
n_candidates = item_tensor.shape[0]

for user_id in tqdm(ground_truth.keys(), desc="Top-N generation", file=sys.stdout):
    if user_id not in user_vectors:
        continue

    # Broadcast view: the user vector is repeated for every candidate without
    # materialising an (n_candidates, dim) copy
    user_tensor = (
        torch.tensor(user_vectors[user_id], dtype=torch.float32)
        .to(device)
        .unsqueeze(0)
        .expand(n_candidates, -1)
    )

    with torch.no_grad():
        for model, config, predictions_dict in model_list:
            scores = model(user_tensor, item_tensor).cpu().numpy().flatten()
            best_idx = np.argsort(scores)[::-1][:top_n]
            # The dict stored in model_list is updated in place; no write-back needed
            predictions_dict[user_id] = candidate_ids[best_idx].tolist()

Top-N generation: 100%|██████████| 85083/85083 [54:10:38<00:00,  2.29s/it]   


In [32]:
# Collected metrics: one dict per evaluated model
metrics_list = []

In [33]:
# Score every model's top-10 lists against the ground truth.
# The list is rebuilt (not appended to) so that re-running this cell cannot
# leave duplicate rows behind.
metrics_list = [
    {
        "index": idx,
        "precision@10": precision_at_k(predictions_dict, ground_truth),
        "map@10": map_at_k(predictions_dict, ground_truth),
        "config": config,
    }
    for idx, (_model, config, predictions_dict) in enumerate(model_list)
]

In [34]:
# Pick the best entry by Precision@10 and by MAP@10
# (max() keeps the first entry when several share the top value)
best_by_precision = max(metrics_list, key=lambda x: x["precision@10"])
best_by_map = max(metrics_list, key=lambda x: x["map@10"])

In [35]:
# Report the winner by Precision@10 and, when it is a different model, the winner by MAP@10
reports = [("Best Model by Precision@10:", best_by_precision)]
if best_by_precision["index"] != best_by_map["index"]:
    reports.append(("\nBest Model by MAP@10:", best_by_map))

for title, entry in reports:
    print(title)
    print(f"Precision@10: {entry['precision@10']}")
    print(f"MAP@10:       {entry['map@10']}")
    print("Config:")
    print(entry["config"])

Best Model by Precision@10:
Precision@10: 0.0032
MAP@10:       0.008
Config:
{'init_scale': 2.0, 'optimizer': 'AdamW', 'learning_rate': 0.001, 'loss_function': 'FocalLoss', 'weighting_strategy': 'none', 'batch_size': 2048, 'epochs': 20}

Best Model by MAP@10:
Precision@10: 0.0032
MAP@10:       0.0081
Config:
{'init_scale': 2.0, 'optimizer': 'SGD', 'learning_rate': 0.01, 'loss_function': 'FocalLoss', 'weighting_strategy': 'balanced', 'batch_size': 1024, 'epochs': 20}
