In [1]:

from typing import List
import numpy as np
from collections import namedtuple
import pandas as pd

import torch
from torchmetrics.classification import BinaryAUROC, BinaryPrecision, BinaryRecall, BinaryF1Score

from helper_modules.metrics import ClassificationMetric

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


# Example Model
class MatrixFactorization(nn.Module):
    """Matrix-factorization recommender with per-user and per-item bias terms.

    Each lookup table is allocated with one extra row and ``padding_idx=0``,
    so row 0 is the padding entry (zero-initialised, excluded from gradient
    updates by ``torch.nn.Embedding``).

    Args:
        model_size: Width of the user and item embedding vectors.
        num_users: Number of users; user tables get ``num_users + 1`` rows.
        num_items: Number of items; item tables get ``num_items + 1`` rows.
    """

    def __init__(self, model_size, num_users, num_items):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(num_users + 1, model_size, padding_idx=0)
        self.item_embedding = torch.nn.Embedding(num_items + 1, model_size, padding_idx=0)
        # Not used by forward(); kept as an attribute for callers that turn scores into probabilities.
        self.sigmoid = torch.nn.Sigmoid()
        self.user_bias = torch.nn.Embedding(num_users + 1, 1, padding_idx=0)
        self.item_bias = torch.nn.Embedding(num_items + 1, 1, padding_idx=0)

    def forward(self, user, pos_item, neg_item):
        """Look up embeddings and biases for a batch of (user, positive, negative) ids.

        Args:
            user: Integer tensor of user ids.
            pos_item: Integer tensor of positive item ids, same shape as ``user``.
            neg_item: Integer tensor of negative item ids, same shape as ``user``.

        Returns:
            Tuple ``(user_emb, pos_item_emb, neg_item_emb, user_bias,
            pos_item_bias, neg_item_bias)``. Embeddings have shape
            ``(*ids.shape, model_size)``; biases have shape ``ids.shape``.
        """
        user_emb = self.user_embedding(user)
        pos_item_emb = self.item_embedding(pos_item)
        neg_item_emb = self.item_embedding(neg_item)
        # Drop only the trailing size-1 bias dimension. A bare squeeze() would
        # also drop the batch dimension for a batch of one and return 0-d biases.
        user_bias = self.user_bias(user).squeeze(-1)
        pos_item_bias = self.item_bias(pos_item).squeeze(-1)
        neg_item_bias = self.item_bias(neg_item).squeeze(-1)
        return user_emb, pos_item_emb, neg_item_emb, user_bias, pos_item_bias, neg_item_bias

# BPR Loss Function
def bpr_loss(user_emb, pos_item_emb, neg_item_emb, user_bias, pos_item_bias, neg_item_bias):
    """Bayesian Personalized Ranking loss for a batch of (user, positive, negative) triples.

    Each item is scored as ``dot(user_emb, item_emb) + user_bias + item_bias``
    (dot product taken over ``dim=1``). The loss is the batch mean of
    ``-log(sigmoid(pos_score - neg_score))``.

    Returns:
        0-d tensor holding the mean loss.
    """

    def _score(item_emb, item_bias):
        # Dot product of user and item vectors plus both bias terms.
        return (user_emb * item_emb).sum(dim=1) + user_bias + item_bias

    margin = _score(pos_item_emb, pos_item_bias) - _score(neg_item_emb, neg_item_bias)
    return -F.logsigmoid(margin).mean()



class ModelTrainer:
    """Train a pairwise recommender on (user, positive, negative) batches and track NDCG.

    The wrapped model must return, from ``model(user, pos_item, neg_item)``,
    the tuple ``(user_emb, pos_item_emb, neg_item_emb, user_bias,
    pos_item_bias, neg_item_bias)``.

    Args:
        model: Model to train; it is moved to ``device``.
        device: Device that the model and every batch are moved to.
        metrics: Accepted for interface compatibility; not used by this class.
        k: Cut-off for the primary validation metric (NDCG@k).
    """

    def __init__(self, model: torch.nn.Module, device: str, metrics: List[str], k: int = 5):
        self.model = model.to(device)
        self.device = device
        # History of the validation metric passed to _is_early_stop.
        self._val_losses = []
        self.k = k

    def train_model(self, optimizer, loss_function, train_dataloader, valid_dataloader, num_epochs):
        """Run the training loop, validating after every epoch.

        Args:
            optimizer: Optimizer over the model parameters.
            loss_function: Callable receiving the six tensors returned by the
                model and returning a scalar loss. ``None`` selects the
                module-level ``bpr_loss``.
            train_dataloader: Sized iterable of ``(user, pos_item, neg_item)`` batches.
            valid_dataloader: Iterable of ``(user, pos_item, neg_item)`` batches.
            num_epochs: Maximum number of epochs.

        Returns:
            Mean NDCG@``self.k`` of the last validated epoch, or ``None`` when
            ``num_epochs`` is 0.
        """
        criterion = bpr_loss if loss_function is None else loss_function
        ndcg_k = None  # stays None if the loop body never runs
        for epoch in range(num_epochs):
            self.model.train()
            train_loss = 0
            for user, pos_item, neg_item in train_dataloader:
                user = user.to(self.device)
                pos_item = pos_item.to(self.device)
                neg_item = neg_item.to(self.device)

                # Forward pass and pairwise loss.
                loss = criterion(*self.model(user, pos_item, neg_item))

                # Backward pass and optimization.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
            print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss/len(train_dataloader):.4f}", end=' ')

            ndcg_k, ndcg_2k = self.validate(valid_dataloader, k=self.k)
            print(f"NDCG at {self.k} {ndcg_k:.4f}, NDCG at {2 * self.k} {ndcg_2k:.4f}")
            if self._is_early_stop(ndcg_k):
                # The monitored quantity is NDCG, not a loss.
                print("Validation NDCG stopped improving. Aborting")
                break
        print(f"Number of parameters trained {self._get_n_params()}")
        return ndcg_k

    def _get_n_params(self) -> int:
        """Return the number of trainable scalar parameters in the model."""
        return sum(param.numel() for param in self.model.parameters() if param.requires_grad)

    def _is_early_stop(self, current_loss: float) -> bool:
        """Decide whether training should stop, recording the metric otherwise.

        Despite the parameter name, ``train_model`` passes the validation
        NDCG, for which higher is better. Stopping is signalled when the value
        has dropped twice in a row (``current < previous < the one before``).
        No check is made until more than two values have been recorded.
        """
        history = self._val_losses
        if len(history) > 2 and current_loss < history[-1] < history[-2]:
            return True
        history.append(current_loss)
        return False

    @staticmethod
    def dcg_torch(relevance_scores, k):
        """Calculate Discounted Cumulative Gain (DCG) at K.

        Uses gains ``2**rel - 1`` and discounts ``log2(rank + 1)`` with ranks
        starting at 1, over the first ``k`` entries of ``relevance_scores``
        (a 1-D tensor already ordered by rank).
        """
        top_k = relevance_scores[:k]
        gains = torch.pow(2, top_k) - 1
        # Build the ranks on the input's device so non-CPU inputs also work.
        ranks = torch.arange(1, len(top_k) + 1, dtype=torch.float32, device=top_k.device)
        return torch.sum(gains / torch.log2(ranks + 1))

    @staticmethod
    def ndcg_user_wise_torch(predicted, true_relevance, k):
        """Calculate NDCG@K for each user and average across users.

        Args:
            predicted: Dict mapping user id to a 1-D tensor of predicted scores.
            true_relevance: Dict mapping user id to a 1-D tensor of labels
                aligned with ``predicted[user]``.
            k: Rank cut-off.

        Returns:
            0-d tensor with the mean NDCG@k. Users whose ideal DCG is 0 (no
            relevant item) contribute 0.0.
        """
        ndcg_scores = []
        for user, user_scores in predicted.items():
            user_labels = true_relevance[user]

            # DCG of the labels in predicted order (highest score first).
            ranking = torch.argsort(user_scores, descending=True)
            dcg = ModelTrainer.dcg_torch(user_labels[ranking], k)

            # Ideal DCG: labels sorted from most to least relevant.
            ideal_labels = torch.sort(user_labels, descending=True).values
            idcg = ModelTrainer.dcg_torch(ideal_labels, k)

            ndcg_scores.append(float(dcg / idcg) if idcg > 0 else 0.0)

        return torch.mean(torch.tensor(ndcg_scores))

    def validate(self, valid_dataloader, k):
        """Validate the model and calculate NDCG using the validation dataloader.

        Each batch contributes, per user, the sigmoid score of the positive
        item (label 1) and of the negative item (label 0).

        Args:
            valid_dataloader: Iterable of ``(user, pos_item, neg_item)`` batches.
            k: Rank cut-off of the primary metric.

        Returns:
            Tuple ``(mean NDCG@k, mean NDCG@2k)``; for ``k=5`` that is
            NDCG@5 and NDCG@10.
        """
        self.model.eval()  # Set model to evaluation mode
        predicted = {}
        true_relevance = {}

        def _record(user_ids, scores, label):
            # Append each score and its label to the per-user accumulators.
            for uid, score in zip(user_ids.tolist(), scores.tolist()):
                predicted.setdefault(uid, []).append(score)
                true_relevance.setdefault(uid, []).append(label)

        with torch.no_grad():  # No gradients required for validation
            for user, pos_item, neg_item in valid_dataloader:
                user = user.to(self.device)
                pos_item = pos_item.to(self.device)
                neg_item = neg_item.to(self.device)

                user_emb, pos_item_emb, neg_item_emb, user_bias, pos_item_bias, neg_item_bias = self.model(user, pos_item, neg_item)

                pos_scores = torch.sum(user_emb * pos_item_emb, dim=-1) + pos_item_bias + user_bias
                _record(user, torch.sigmoid(pos_scores), 1)

                neg_scores = torch.sum(user_emb * neg_item_emb, dim=-1) + neg_item_bias + user_bias
                _record(user, torch.sigmoid(neg_scores), 0)

        # Convert lists to tensors for each user
        for uid in predicted:
            predicted[uid] = torch.tensor(predicted[uid])
            true_relevance[uid] = torch.tensor(true_relevance[uid])

        mean_ndcg_k = self.ndcg_user_wise_torch(predicted, true_relevance, k)
        mean_ndcg_2k = self.ndcg_user_wise_torch(predicted, true_relevance, 2 * k)
        return mean_ndcg_k, mean_ndcg_2k


In [4]:
from data_selector import DatasetSelector 

# Build the train / validation / test dataloaders and the feature statistics
# for the 'steam' dataset.
# NOTE(review): the meaning of the positional arguments 64, 5, 5 is defined by
# DatasetSelector.get_data, which is not visible here — confirm and name them.
dataset_selector = DatasetSelector()
(
    train_dataloader,
    valid_dataloader,
    test_dataloader,
    features_stats,
) = dataset_selector.get_data('steam', 64, 5, 5)

2024-12-11 13:27:41,188 - INFO - Loading data from data/steam_games/steam-200k.csv
2024-12-11 13:27:41,309 - INFO - Data loaded successfully with shape: (199293, 4)
2024-12-11 13:27:41,357 - INFO - Combined duplicates from 199293 to 199281
2024-12-11 13:27:41,377 - INFO - Dropped rare games from 5155 to 3047
2024-12-11 13:27:41,378 - INFO - Dropped inactive users from 12354 to 4797
2024-12-11 13:27:41,378 - INFO - Overall dropped dataset from 199281 to 177443
2024-12-11 13:27:41,439 - INFO - Added validation colums.  Train samples 151661 Validation samples 14391 Test samples 11391
2024-12-11 13:27:41,451 - INFO - Dropped play data: (115430, 6)
2024-12-11 13:27:41,476 - INFO - Encoded user, 4797 unique values
2024-12-11 13:27:41,479 - INFO - Saved user_encoding to data/steam_games/user_encoding.json
2024-12-11 13:27:41,491 - INFO - Encoded game, 3045 unique values
2024-12-11 13:27:41,493 - INFO - Saved game_encoding to data/steam_games/game_encoding.json
2024-12-11 13:27:41,535 - INFO -

N train interactions 89648
N valid interactions 14391
N test interactions 11391


In [5]:
# Train a 14-dimensional matrix-factorization model on CPU for two epochs.
# The user/item counts equal the "Encoded user" / "Encoded game" totals
# logged by the data-loading cell.
device = torch.device('cpu')

model = MatrixFactorization(model_size=14, num_users=4797, num_items=3045)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model_trainer = ModelTrainer(
    model=model,
    device=device,
    metrics=['roc_auc', 'f1', 'precision', 'recall'],
)

ndcg_5 = model_trainer.train_model(
    optimizer=optimizer,
    loss_function=None,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    num_epochs=2,
)

Epoch [1/2], Train Loss: 1.4336 NDCG at 5 0.7821, NDCG at 10 0.8335
Epoch [2/2], Train Loss: 0.6010 NDCG at 5 0.8141, NDCG at 10 0.8589
Number of parameters trained 117660


In [6]:
# Score the test set, collect per-user predictions, and compute mean NDCG@5 / NDCG@10.
model.eval()  # Set model to evaluation mode
predicted = {}
items_dict = {}
true_relevance = {}


def _record_batch(user_ids, item_ids, scores, label):
    """Append one batch of scores to the per-user accumulators with the given label."""
    for uid, item, score in zip(user_ids.tolist(), item_ids.tolist(), scores.tolist()):
        predicted.setdefault(uid, []).append(score)
        true_relevance.setdefault(uid, []).append(label)
        items_dict.setdefault(uid, []).append(item)


with torch.no_grad():  # No gradients required for evaluation
    # NOTE(review): the test dataloader is iterated 20 times — presumably to
    # draw different negatives on each pass; confirm against the dataloader.
    for _ in range(20):
        for user, pos_item, neg_item in test_dataloader:
            user_emb, pos_item_emb, neg_item_emb, user_bias, pos_item_bias, neg_item_bias = model(user, pos_item, neg_item)

            # Positive pairs: label 1.
            pos_scores = torch.sum(user_emb * pos_item_emb, dim=-1) + pos_item_bias + user_bias
            _record_batch(user, pos_item, model.sigmoid(pos_scores), 1)

            # Negative pairs: label 0.
            neg_scores = torch.sum(user_emb * neg_item_emb, dim=-1) + neg_item_bias + user_bias
            _record_batch(user, neg_item, model.sigmoid(neg_scores), 0)

# Convert lists to tensors for each user and build one frame per user.
# The frames are concatenated once, instead of growing `df` inside the loop.
user_frames = []
for uid in predicted:
    predicted[uid] = torch.tensor(predicted[uid])
    true_relevance[uid] = torch.tensor(true_relevance[uid])
    user_frames.append(pd.DataFrame({
        'user': [uid] * len(predicted[uid]),
        'game': items_dict[uid],
        'pred': predicted[uid].numpy(),
        'target': true_relevance[uid].numpy(),
    }))
df = pd.concat(user_frames) if user_frames else pd.DataFrame()
df = df.drop_duplicates().reset_index(drop=True)

# NOTE(review): `df` is de-duplicated, but the NDCG below is computed from the
# `predicted` / `true_relevance` dicts, which still contain every row collected
# over all passes.
mean_ndcg_5 = ModelTrainer.ndcg_user_wise_torch(predicted, true_relevance, 5)
mean_ndcg_10 = ModelTrainer.ndcg_user_wise_torch(predicted, true_relevance, 10)

In [25]:
def dcg_torch(relevance_scores, k):
    """Calculate Discounted Cumulative Gain (DCG) at k.

    Uses gains ``2**rel - 1`` and discounts ``log2(rank + 1)`` with ranks
    starting at 1, over the first ``k`` entries of ``relevance_scores``
    (a 1-D tensor already ordered by rank).
    """
    top_k = relevance_scores[:k]
    gains = torch.pow(2, top_k) - 1
    # Build the ranks on the input's device so non-CPU inputs also work.
    ranks = torch.arange(1, len(top_k) + 1, dtype=torch.float32, device=top_k.device)
    return torch.sum(gains / torch.log2(ranks + 1))


def ndcg_user_wise_torch(predicted, true_relevance, k):
    """Calculate NDCG@k separately for every user.

    Args:
        predicted: Dict mapping user id to a 1-D tensor of predicted scores.
        true_relevance: Dict mapping user id to a 1-D tensor of labels aligned
            with ``predicted[user]``.
        k: Rank cut-off.

    Returns:
        1-D tensor with one NDCG@k value per user, in the iteration order of
        ``predicted``. Users whose ideal DCG is 0 get 0.0.
    """
    ndcg_scores = []
    for user, user_scores in predicted.items():
        user_labels = true_relevance[user]

        # DCG of the labels in predicted order (highest score first). Uses the
        # notebook-level dcg_torch, so this cell does not depend on ModelTrainer.
        ranking = torch.argsort(user_scores, descending=True)
        dcg = dcg_torch(user_labels[ranking], k)

        # Ideal DCG: labels sorted from most to least relevant.
        ideal_labels = torch.sort(user_labels, descending=True).values
        idcg = dcg_torch(ideal_labels, k)

        ndcg_scores.append(float(dcg / idcg) if idcg > 0 else 0.0)
    return torch.tensor(ndcg_scores)

In [32]:
# Spot check: per-user NDCG@5 of the last five users in `predicted`.
ndcg_user_wise_torch(predicted, true_relevance, 5)[-5:]

tensor([0.0000, 1.0000, 0.0000, 1.0000, 0.1312])

In [33]:
def get_dcg(relevance_scores, k):
    """Calculate Discounted Cumulative Gain (DCG) at K with NumPy.

    Gains are ``2**rel - 1`` and discounts are ``log2(rank + 1)`` with ranks
    starting at 1; only the first ``k`` entries of ``relevance_scores`` count.
    """
    top_k = relevance_scores[:k]
    ranks = np.arange(1, len(top_k) + 1)
    return np.sum((np.power(2, top_k) - 1) / np.log2(ranks + 1))

# Per-user NDCG@5 recomputed with NumPy, as a cross-check of the torch version.
ndcg_scores = []
for user in predicted:
    scores = np.array(predicted[user])
    labels = np.array(true_relevance[user])

    # Labels reordered from highest to lowest predicted score.
    ranked_labels = labels[np.argsort(scores)[::-1]]
    dcg = get_dcg(ranked_labels, 5)

    # Best achievable ordering: labels sorted in descending order.
    idcg = get_dcg(np.sort(labels)[::-1], 5)

    if idcg > 0:
        ndcg_scores.append(dcg / idcg)
    else:
        ndcg_scores.append(0.0)

ndcg_scores[-5:]

[0.0, 1.0, 0.0, 1.0, 0.13120507751234178]

In [None]:
# Scratch copy of the get_dcg body, never executed (the cell has no execution count).
# NOTE(review): `relevance_scores` is not assigned at top level in any visible
# cell, so this cell fails on a fresh kernel — consider deleting it.
relevance_scores = relevance_scores[:k]
gains = np.power(2, relevance_scores) - 1
discounts = np.log2(np.arange(1, len(relevance_scores) + 1) + 1)
np.sum(gains / discounts)

In [21]:
# Mean test-set NDCG@5 and NDCG@10 computed by the test-evaluation cell.
mean_ndcg_5, mean_ndcg_10

(tensor(0.4977), tensor(0.6030))

In [19]:
# Inspect the collected test rows (game, prediction, target) for user 1.
df.loc[df['user'] == 1]

Unnamed: 0,user,game,pred,target
115141,1,40,0.410790,1
115142,1,823,0.471690,0
115143,1,42,0.924542,1
115144,1,1799,0.803064,0
115145,1,41,0.994476,1
...,...,...,...,...
115198,1,62,0.999493,0
115199,1,97,0.999939,0
115200,1,900,0.862798,0
115201,1,1757,0.987149,0


In [46]:
# Flatten the per-batch container `u` into a flat list of Python scalars.
# NOTE(review): `u` is not defined in any visible cell, and this cell is
# repeated at the start of the larger flattening cell further down.
u1 = [x.item() for i in u for x in i]

In [41]:
import pandas as pd

In [50]:
# Flatten the per-batch containers into flat lists of Python scalars. The
# following cell uses them as the user / game / pred / relevance columns.
# NOTE(review): `u`, `g`, `p` and `l` are not defined in any visible cell.
u1 = [x.item() for i in u for x in i]
g1 = [x.item() for i in g for x in i]
p1 = [x.item() for i in p for x in i]
l1 = [x.item() for i in l for x in i]

In [51]:
# One row per scored (user, game) pair, with the prediction and its relevance label.
predictions = pd.DataFrame(dict(user=u1, game=g1, pred=p1, relevance=l1))
predictions

Unnamed: 0,user,game,pred,relevance
0,1211,94,0.230842,1.0
1,326,146,0.872687,0.0
2,180,18,0.975856,1.0
3,1929,1519,0.006788,0.0
4,1745,1581,0.823958,0.0
...,...,...,...,...
49744,617,693,0.013872,0.0
49745,422,535,0.001008,0.0
49746,508,575,0.177538,0.0
49747,397,727,0.967884,0.0


In [52]:
# Collect one row per validation example (user, game, prediction, label)
# into flat lists, then build the `predictions` frame from them.
# NOTE(review): this cell unpacks `(batch, labels)` pairs with a four-field
# batch and calls `model(batch)`, whereas MatrixFactorization.forward defined
# above takes (user, pos_item, neg_item) and the other cells iterate the
# dataloaders as triples — presumably written for a different model and
# dataloader; confirm before re-running.
users = []
games = []
preds = []
targets = []
for batch, labels in valid_dataloader:
    # user_features and game_features are unpacked but not used in this cell.
    (user_id, game_id, user_features, game_features) = batch
    out = model(batch)
    for user, game, pred, label in zip(user_id, game_id, out, labels):
        users.append(user.item())
        games.append(game.item())
        preds.append(pred.item())
        targets.append(label.item())
predictions = pd.DataFrame({'user': users, 'game': games, 'pred': preds, 'relevance': targets})

In [57]:
# Precision@k per user, then its mean over users.
k = 10

# Rank every user's rows from highest to lowest predicted score. `predictions`
# is rebound to the sorted frame, which later cells display.
predictions = predictions.sort_values(by=['user', 'pred'], ascending=[True, False])

# Keep the k best-scored rows of each user and sum their relevance labels.
top_k = predictions.groupby('user').head(k)
hits_per_user = top_k.groupby('user')['relevance'].apply(lambda rel: rel.sum())

# The denominator is always k, even for users with fewer than k rows.
user_precisions = hits_per_user / k
user_precisions.mean()

0.13522884882108183

In [58]:
# Per-user precision@k values computed in the previous cell.
user_precisions

user
0       0.2
1       0.1
2       0.3
3       0.0
4       0.1
       ... 
2158    0.2
2159    0.3
2160    0.1
2161    0.1
2162    0.1
Name: relevance, Length: 2163, dtype: float64

In [60]:
# Inspect the ranked predictions of user 3, whose precision@k is 0.0.
predictions.loc[predictions['user'] == 3]

Unnamed: 0,user,game,pred,relevance
35029,3,592,0.989874,0.0
14120,3,302,0.943954,0.0
23017,3,218,0.94338,0.0
42163,3,2045,0.929699,0.0
33123,3,1250,0.927314,0.0
40520,3,39,0.921961,0.0
5931,3,379,0.919465,0.0
4237,3,1213,0.905511,0.0
16028,3,1496,0.884564,0.0
44556,3,928,0.88395,0.0


In [63]:
import numpy as np
# Worked DCG example on a small graded-relevance vector.
a = np.asarray([3, 2, 2, 0])
gains = np.power(2, a) - 1                      # 2**rel - 1
discounts = np.log2(np.arange(2, len(a) + 2))   # log2(rank + 1) for ranks 1..len(a)
gains

array([7, 3, 3, 0])

In [66]:
# Scratch cell: prints two literals and is unrelated to the analysis — candidate for deletion.
print(1, 3)

1 3


In [65]:
# DCG of the worked example: sum of gain / discount over all positions.
(gains / discounts).sum()

10.392789260714373