In [None]:
####### INSTALATION #######
pip uninstall torch -y
pip install torch==1.13.1
!pip uninstall torch-scatter -y
!pip uninstall torch-sparse -y
!pip uninstall pyg-lib -y
!pip uninstall git+https://github.com/pyg-team/pytorch_geometric.git -y

import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [2]:
###### IMPORT #######
import numpy as np
import pandas as pd
import random
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm import tqdm
from collections import defaultdict
import json
import multiprocessing
import matplotlib.pyplot as plt

from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [None]:
#### DATA LOADER ####
from torch_geometric.data import download_url, extract_zip
from torch import Tensor

def data_loader(ratings_df):
    # Create a mapping from unique user indices to range [0, num_user_nodes):
    unique_user_id = ratings_df['userId'].unique()
    unique_user_id = pd.DataFrame(data={
        'userId': unique_user_id,
        'mappedID': pd.RangeIndex(len(unique_user_id)),
    })
    print("Mapping of user IDs to consecutive values:")
    print("==========================================")
    print(unique_user_id.head())
    print()
    # Create a mapping from unique movie indices to range [0, num_movie_nodes):
    unique_item_id = ratings_df['itemId'].unique()
    unique_item_id = pd.DataFrame(data={
        'itemId': unique_item_id,
        'mappedID': pd.RangeIndex(len(unique_item_id)),
    })
    print("Mapping of item IDs to consecutive values:")
    print("===========================================")
    print(unique_item_id.head())

    ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                                left_on='userId', right_on='userId', how='left')
    ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
    ratings_item_id = pd.merge(ratings_df['itemId'], unique_item_id,
                                left_on='itemId', right_on='itemId', how='left')
    ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].values)
    # With this, we are ready to construct our `edge_index` in COO format
    # following PyG semantics:
    edge_index_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)
    # assert edge_index_user_to_item.size() == (2, 100836)
    print()
    print("Final edge indices pointing from users to items:")
    print("=================================================")
    print(edge_index_user_to_item)
    return unique_user_id, unique_item_id, edge_index_user_to_item

def movie_loader():
    url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    extract_zip(download_url(url, '.'), '.')
    movies_path = './ml-latest-small/movies.csv'
    ratings_path = './ml-latest-small/ratings.csv'
    items_ratings_df = pd.read_csv(ratings_path)
    items_ratings_df = items_ratings_df.rename(columns={'movieId': 'itemId'})
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)
    items_df = pd.read_csv(movies_path)
    items_df = items_df.rename(columns={'movieId': 'itemId', 'title': 'name'})
    items_df = pd.merge(items_df, unique_item_id, on='itemId', how='left')
    items_df = items_df.sort_values('mappedID') # (Just the last 20 movies have NaN mappedId)
    genres = items_df['genres'].str.get_dummies('|')
    print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
    item_feat = torch.from_numpy(genres.values).to(torch.float)
    assert item_feat.size() == (9742, 20)  # 20 genres in total.
    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feat

def contract_loader():
    items_ratings_df = pd.read_parquet('dataset/user_contract_rating.parquet')
    items_ratings_df = items_ratings_df[:100000]
    items_df = {}
    items_df['name'] = items_ratings_df['item'].unique()
    items_df['itemId'], unique_names = pd.factorize(items_df['name'])
    items_df['itemId'] = items_df['itemId'] + 1
    items_df = pd.DataFrame(items_df, columns=['itemId', 'name'])

    contract_top_words_df = pd.read_parquet('dataset/contract_top_words.parquet')
    contract_top_words_df = contract_top_words_df.rename(columns={'contract_name': 'name'})
    contracts_df_top_words = items_df.merge(contract_top_words_df, on='name', how='left')
    contracts_df_top_words['keywords'] = contracts_df_top_words['keywords'].fillna('')
    items_df = contracts_df_top_words
    items_df.set_index('itemId', inplace=True)
    # f =5
    items_df['truncated_keywords'] = items_df['keywords'].apply(lambda x: ','.join(x.split(',')))
    X_df = items_df['truncated_keywords'].str.get_dummies(',')
    item_feat = torch.from_numpy(X_df.values).to(torch.float)
    print(item_feat.shape)
    print(item_feat)
    print(items_df)
    print(X_df)
    items_ratings_df = items_ratings_df.rename(columns={'user': 'userId', 'item': 'itemId'})
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)
    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feat

universal_mode = 'contract'
loaders = {
    'contract_loader': contract_loader,
    'movie_loader': movie_loader,
}
unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feat = loaders[f'{universal_mode}_loader']()


In [4]:
######### LINK BINARY PRED MODEL ##########
def train_test_generator(unique_user_id, items_df, item_feat, edge_index_user_to_item):  
    data = HeteroData()
    data["user"].node_id = torch.arange(len(unique_user_id))
    data["item"].node_id = torch.arange(len(items_df))
    data["item"].x = item_feat
    data["user", "rates", "item"].edge_index = edge_index_user_to_item
    data = T.ToUndirected()(data)

    transform = T.RandomLinkSplit(
        num_val=0,
        num_test=0.2,
        disjoint_train_ratio=0.3,
        neg_sampling_ratio=0,
        add_negative_train_samples=False,
        edge_types=("user", "rates", "item"),
        rev_edge_types=("item", "rev_rates", "user"), 
    )
    
    train_data, val_data, test_data = transform(data)
    return data, train_data, test_data

def GNN_recommender(data, train_data):

    # Define seed edges:
    edge_label_index = train_data["user", "rates", "item"].edge_label_index
    edge_label = train_data["user", "rates", "item"].edge_label
    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=[20, 10],
        neg_sampling_ratio=2.0,
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=128,
        shuffle=True,
    )

    class GNN(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            self.conv1 = SAGEConv(hidden_channels, hidden_channels)
            self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
            x = F.relu(self.conv1(x, edge_index))
            x = self.conv2(x, edge_index)
            return x
    # Our final classifier applies the dot-product between source and destination
    # node embeddings to derive edge-level predictions:
    class Classifier(torch.nn.Module):
        def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
            edge_feat_user = x_user[edge_label_index[0]] # Convert node embeddings to edge-level representations:
            edge_feat_item = x_item[edge_label_index[1]]
            scores = (edge_feat_user * edge_feat_item).sum(dim=-1)
            return scores # Apply dot-product to get a prediction per supervision edge:
        
    class Model(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            # Since the dataset does not come with rich features, we also learn two
            # embedding matrices for users and items:
            self.item_lin = torch.nn.Linear(item_feat.shape[1], hidden_channels)
            self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(data["item"].num_nodes, hidden_channels)
            # Instantiate homogeneous GNN:
            self.gnn = GNN(hidden_channels)
            # Convert GNN model into a heterogeneous variant:
            self.gnn = to_hetero(self.gnn, metadata=data.metadata())
            self.classifier = Classifier()

        def forward(self, data: HeteroData) -> Tensor:
            x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "item": self.item_lin(data["item"].x) + self.item_emb(data["item"].node_id),
            } 
            # `x_dict` holds feature matrices of all node types
            # `edge_index_dict` holds all edge indices of all edge types
            x_dict = self.gnn(x_dict, data.edge_index_dict)
            pred = self.classifier(
                x_dict["user"],
                x_dict["item"],
                data["user", "rates", "item"].edge_label_index,
            )
            return pred
            
    ########## TRAINING ##########
    model = Model(hidden_channels=64)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: '{device}'")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(1, 10):
        total_loss = total_examples = 0
        for sampled_data in tqdm(train_loader):
            optimizer.zero_grad()
            sampled_data.to(device)
            pred = model(sampled_data)
            ground_truth = sampled_data["user", "rates", "item"].edge_label
            loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
            loss.backward()
            optimizer.step()
            total_loss += float(loss) * pred.numel()
            total_examples += pred.numel()

        # TODO: Add the val_loader, keep the best model
        print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

    ########## AUC EVAL VALIDATION #########
    # edge_label_index = val_data["user", "rates", "item"].edge_label_index
    # edge_label = val_data["user", "rates", "item"].edge_label
    # # val_data has neg samples in it
    # val_loader = LinkNeighborLoader(
    #     data=val_data,
    #     num_neighbors=[20, 10],
    #     edge_label_index=(("user", "rates", "item"), edge_label_index),
    #     edge_label=edge_label,
    #     batch_size=3 * 128,
    #     shuffle=False,
    # )
    # sampled_data = next(iter(val_loader))
    # preds = []
    # ground_truths = []
    # for sampled_data in tqdm(val_loader):
    #     with torch.no_grad():
    #         sampled_data.to(device)
    #         preds.append(model(sampled_data))
    #         ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
    # pred = torch.cat(preds, dim=0).cpu().numpy()
    # ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    # auc = roc_auc_score(ground_truth, pred)
    # print()
    # print(f"Validation AUC: {auc:.4f}")
    # return data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data, model
    return model

In [None]:
########## TRAIN TEST GENERAION ############

### ABLATION EXPRIMENT ### uncomment below for hiding item_features
# item_feat = torch.zeros_like(item_feat)

data, train_data, test_data = train_test_generator(unique_user_id, items_df, item_feat, edge_index_user_to_item)

In [7]:
######## ALL_TO_ALL USER_ITEM PAIRS GENERATOR IN TEST_DATA #########

# If mode GNN run below
### SLICING TEST_DATA FOR ALL_TO_ALL EVAL ###
slice_rate = 0.4
test_data["user", "rates", "item"].edge_label_index = test_data["user", "rates", "item"].edge_label_index[:, : int(slice_rate * len(test_data["user", "rates", "item"].edge_label_index[0]))]
test_data["user", "rates", "item"].edge_label = test_data["user", "rates", "item"].edge_label[ : int(slice_rate * len(test_data["user", "rates", "item"].edge_label))]

edge_index_test = set(zip(test_data["user", "rates", "item"].edge_label_index[0].numpy(), test_data["user", "rates", "item"].edge_label_index[1].numpy()))

all_users = test_data["user", "rates", "item"].edge_label_index[0].unique().numpy()
all_items = test_data["user", "rates", "item"].edge_label_index[1].unique().numpy()

new_edges = []
new_labels = []

for user_id in all_users:
    for item_id in all_items:
        if (user_id, item_id) not in edge_index_test:
            new_edges.append((user_id, item_id))
            new_labels.append(0)

import copy
test_data_prime = copy.deepcopy(test_data)

if new_edges:
    new_edges_tensor = torch.tensor(new_edges, dtype=torch.int64).t().contiguous()
    new_labels_tensor = torch.tensor(new_labels, dtype=torch.int64)

    test_data_prime["user", "rates", "item"].edge_label_index = torch.cat((test_data_prime["user", "rates", "item"].edge_label_index, new_edges_tensor), dim=1)
    test_data_prime["user", "rates", "item"].edge_label = torch.cat((test_data_prime["user", "rates", "item"].edge_label, new_labels_tensor), dim=0)

print('test edges shape BEFORE adding all possible user item pairs', test_data["user", "rates", "item"].edge_label_index.shape)
print('test edges shape AFTER adding all possible user item pairs', test_data_prime["user", "rates", "item"].edge_label_index.shape)

print('unique users', len(test_data_prime["user", "rates", "item"].edge_label_index[0].unique()))
print('unique items', len(test_data_prime["user", "rates", "item"].edge_label_index[1].unique()))



test edges shape BEFORE adding all possible user item pairs torch.Size([2, 8000])
test edges shape AFTER adding all possible user item pairs torch.Size([2, 17552028])
unique users 4602
unique items 3814


In [None]:
####### EXPRIMENTS #######

#TODO: Check how many users/items are not in train set, if it is small, what we should do?

train_data_unique_users = set(train_data['user', 'rates', 'item'].edge_label_index[0].unique())
train_data_unique_items = set(train_data['user', 'rates', 'item'].edge_label_index[1].unique())
unique_users = set(unique_user_id['mappedID'].unique())
unique_items = set(unique_item_id['mappedID'].unique())
users_not_in_train = unique_users - train_data_unique_users
items_not_in_train = unique_items - train_data_unique_items
print(len(users_not_in_train))

mask_user = torch.tensor([user in users_not_in_train for user in test_data_prime["user", "rates", "item"].edge_label_index[0]])
mask_item = torch.tensor([item in items_not_in_train for item in test_data_prime["user", "rates", "item"].edge_label_index[1]])

test_data_prime_ucsp = copy.deepcopy(test_data_prime)
test_data_prime_icsp = copy.deepcopy(test_data_prime)
test_data_prime_ucsp = test_data_prime_ucsp["user", "rates", "item"].edge_label_index[:, mask_user]
test_data_prime_ucsp = test_data_prime_ucsp["user", "rates", "item"].edge_label[mask_user]

test_data_prime_ucsp = test_data_prime_icsp["user", "rates", "item"].edge_label_index[:, mask_item]
test_data_prime_ucsp = test_data_prime_icsp["user", "rates", "item"].edge_label[mask_item]

# Now just need to pass each test_ucsp or test_icsp during PRED GEN time



###### CONTRACTS MODEL ######
# data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data, model = GNN_recommender(unique_user_id, items_df, contract_feat, edge_index_user_to_contract)
data, train_data, val_data, train_loader, test_data, model = GNN_recommender(unique_user_id, items_df, contract_feat, edge_index_user_to_contract)

# Ablation study of GNN
# ablation_without_contract_feature = torch.zeros_like(contract_feat)
# data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data = GNN_recommender(unique_user_id, items_df, ablation_without_contract_feature, edge_index_user_to_contract)



# Cold Start User: Just need to change the test_df

# Cold Start Item

# Diversity

# Contract Representation (effect of f top-keywords in contract_feat)

# Sparsity (keep users just with > h interactions)




In [5]:
########## GNN TRAINING ############
#if model_mode == GNN run below
model = GNN_recommender(data, train_data)


Device: 'cuda'


100%|██████████| 188/188 [00:30<00:00,  6.25it/s]


Epoch: 001, Loss: 0.4480


100%|██████████| 188/188 [00:28<00:00,  6.64it/s]


Epoch: 002, Loss: 0.3302


100%|██████████| 188/188 [00:28<00:00,  6.58it/s]


Epoch: 003, Loss: 0.2777


100%|██████████| 188/188 [00:28<00:00,  6.56it/s]


Epoch: 004, Loss: 0.2516


100%|██████████| 188/188 [00:28<00:00,  6.64it/s]


Epoch: 005, Loss: 0.2255


100%|██████████| 188/188 [00:28<00:00,  6.60it/s]


Epoch: 006, Loss: 0.2089


100%|██████████| 188/188 [00:28<00:00,  6.49it/s]


Epoch: 007, Loss: 0.1890


100%|██████████| 188/188 [00:28<00:00,  6.54it/s]


Epoch: 008, Loss: 0.1720


100%|██████████| 188/188 [00:28<00:00,  6.53it/s]

Epoch: 009, Loss: 0.1598





In [8]:
######## GNN PRED FOR TEST_DATA_PRIME #########
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_loader = LinkNeighborLoader(
    data=test_data_prime,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "item"), test_data_prime["user", "rates", "item"].edge_label_index),
    edge_label=test_data_prime["user", "rates", "item"].edge_label,
    batch_size=3 * 128,
    shuffle=False,
)
sampled_data = next(iter(test_loader))
preds = []
ground_truths = []
for sampled_data in tqdm(test_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
pred_gnn = torch.cat(preds, dim=0).cpu().numpy()
ground_truth_gnn = torch.cat(ground_truths, dim=0).cpu().numpy()

print('all ground truth len', len(ground_truth_gnn))

100%|██████████| 45709/45709 [1:41:04<00:00,  7.54it/s]  


all ground truth len 17552028


In [12]:
########### DATA PREPRATION FOR MF MODELS #############

# if model_mode == 'MF':
test_df_index = test_data_prime['user', 'rates', 'item'].edge_label_index.numpy()
test_df_label = test_data_prime['user', 'rates', 'item'].edge_label.numpy()
test_df_index = test_df_index.T 
test_df = pd.DataFrame(test_df_index, columns=['user', 'item'])
test_df['rating'] = test_df_label

train_df_index = train_data['user', 'rates', 'item'].edge_label_index.numpy()
train_df_label = train_data['user', 'rates', 'item'].edge_label.numpy()
train_df_index = train_df_index.T 
train_df = pd.DataFrame(train_df_index, columns=['user', 'item'])
train_df['rating'] = train_df_label


contract_to_topic_df = pd.read_parquet("dataset/contract_name_topic.parquet")

# def add_topic(df):
#     df['topic'] = ''
#     for i, item in tqdm(df.iterrows(), total=len(df)):
#         item_name = unique_item_id[unique_item_id['mappedID'] == item['item']]['itemId']
#         topic = contract_to_topic_df[contract_to_topic_df['contract_name'] == item_name.item()]['most_probable_topic']
        
#         if topic.shape[0] == 0:  # No matches
#             df.at[i, 'topic'] = 0
#         elif topic.shape[0] > 1:  # Multiple matches
#             df.at[i, 'topic'] = topic.iloc[0].item()
#         else:  # Exactly one match
#             df.at[i, 'topic'] = topic.item()
    
#     return df

# test_df = add_topic(test_df)
# train_df = add_topic(train_df)

In [13]:
######## NAME LEVEL MF TRAIN & PRED #########
from lightfm import LightFM
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(data['user'].node_id.numpy(), data['item'].node_id.numpy())
user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

(train_interactions, train_interactions_weight) = dataset.build_interactions((row['user'], row['item'], row['rating']) for index, row in train_df.iterrows())

model = LightFM(loss='warp')
model.fit(train_interactions, epochs=30, num_threads=2, sample_weight=train_interactions_weight)

test_df['pred_nmf'] = 0

for user, user_data in tqdm(test_df.groupby('user'), total=test_df['user'].nunique()):
    user_id_internal = user_ids_mapping[user]
    item_ids_internal = np.array([item_ids_mapping[item] for item in user_data['item']])
    predictions = model.predict(user_id_internal, item_ids_internal)
    test_df.loc[user_data.index, 'pred_nmf'] = predictions

pred_nmf = test_df['pred_nmf'].to_numpy()
ground_truth_mf = test_df['rating'].to_numpy()

100%|██████████| 4602/4602 [00:31<00:00, 146.77it/s]


In [None]:
######## CONTRACT LEVEL MF TRAIN & PRED #########
from lightfm import LightFM
from lightfm.data import Dataset
dataset = Dataset()
dataset.fit(data['user'].node_id.numpy(), np.arange(15)) # since we have 0 to 14 topics
user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

(train_interactions, train_interactions_weight) = dataset.build_interactions((row['user'], row['topic'], row['rating']) for index, row in train_df.iterrows())

model = LightFM(loss='warp')
model.fit(train_interactions, epochs=30, num_threads=2, sample_weight=train_interactions_weight)

def topic_popular_contracts(df):
    item_rating_sum = df.groupby(['topic', 'item'])['rating'].sum().reset_index()
    sorted_items = item_rating_sum.sort_values(['topic', 'rating'], ascending=[True, False])
    topic_to_popular_items = {k: g['item'].tolist() for k, g in sorted_items.groupby('topic')}
    return topic_to_popular_items

test_df['pred_cmf'] = 0
topic_popular_contracts_dict = topic_popular_contracts(test_df)
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    user_id_internal = user_ids_mapping[row['user']]
    # now topic_id is a prediction, we need to get the pred for all 14 topics, then sort it and return the topic (indices) with highest value
    topic_pred = model.predict(user_id_internal, np.arange(15))
    topic_id = topic_pred.argsort()[::-1][0] # we can get f predicted topic instead of the highest one
    test_df['pred_cmf'][i] = topic_popular_contracts_dict[topic_id][0] # we are getting first popular contract in predicted topic


pred_cmf = test_df['pred_cmf'].to_numpy()

In [14]:
######### HIT@K EVAL V2 ##########
#Note: One simple way for faster hit@k is to calculate the hit@k just for the first 1k users for all models
def precision_at_k(user_id, edge_index, ground_truth, pred, k):

    mask = edge_index[0] == user_id
    filtered_pred = pred[mask]
    filtered_ground_truth = ground_truth[mask]
    sorted_indices = filtered_pred.argsort()[:: -1]

    hit = 0
    for i in sorted_indices[:k]:
        if filtered_ground_truth[i] > 0: hit+= 1

    return hit/k


def ap_at_k(k, precision_at_k, mode):
    precisions = []
    edge_index = test_data_prime['user', 'rates', 'item'].edge_label_index
    user_ids = edge_index[0].unique()
    count = 0
    for user_id in tqdm(user_ids, total=len(user_ids)):
        if mode == 'nmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth_mf, pred_nmf, k)) # ground_truth is the same for both GNN and mf
        if mode == 'cmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth_mf, pred_cmf, k))
        else:
            precisions.append(precision_at_k(user_id, edge_index, ground_truth_gnn, pred_gnn, k))
            count += 1
            # if count % 100 == 0: 
            #     print(np.mean(precisions))

    return np.mean(precisions)


k_values = [1, 5]
for k in k_values:
    hit_at_k = ap_at_k(k, precision_at_k, mode='nmf')
    print(f"AP@{k}:", hit_at_k)


100%|██████████| 4602/4602 [05:22<00:00, 14.28it/s]


AP@1: 0.04758800521512386


100%|██████████| 4602/4602 [05:22<00:00, 14.27it/s]

AP@5: 0.03489787049109083





' NMF MovieLens\nAP@1: 0.07635467980295567\nAP@5: 0.06962233169129721\n'

In [None]:
######### HIT@K EVAL V1 ##########
# in val_data len(edge_index) = 80670, but len(edge_label_index) = 30249, we selected edge_label_index since for train_loader used the same
def precision_at_k(user_id, edge_index, ground_truth, pred, k):

    mask = edge_index[0] == user_id
    filtered_pred = pred[mask]
    filtered_ground_truth = ground_truth[mask]
    sorted_indices = filtered_pred.argsort()[:: -1]

    top_k = [(filtered_ground_truth[i], filtered_pred[i]) for i in sorted_indices[:k]]
    hit = 0
    for i in range(len(top_k)):
        ground_truth, pred = top_k[i]
        if ground_truth > 0 and pred > 0: # I think we should remove this: and pred > 0:
            hit += 1
    precision = hit / k

    return precision


def ap_at_k(k, precision_at_k, mode):
    precisions = []
    edge_index = val_loader.data['user', 'rates', 'item'].edge_label_index
    for user_id in tqdm(edge_index[0], total=len(edge_index[0])):
        if mode == 'nmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_nmf, k)) # ground_truth is the same for both GNN and mf
        if mode == 'cmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_cmf, k))
        else:
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred, k))
            break

    return np.mean(precisions)


k_values = [1, 2, 3, 4, 5]
for k in k_values:
    hit_at_k = ap_at_k(k, precision_at_k, mode='GNN')
    print(f"AP@{k}:", hit_at_k)

