In [None]:
####### INSTALLATION #######
# Environment setup cell: pins torch, then installs the PyG extension wheels
# that match the installed torch version, followed by the remaining
# third-party packages imported by the notebook.
# NOTE(review): `!pip` runs whichever pip is first on PATH; `%pip` would target
# the running kernel's environment. Confirm both point at the same environment.

!pip uninstall torch -y
!pip install torch==1.13.1
# !pip uninstall torch-scatter -y
# !pip uninstall torch-sparse -y
# !pip uninstall pyg-lib -y
# !pip uninstall git+https://github.com/pyg-team/pytorch_geometric.git -y
# !pip uninstall sentence_transformers -y

import os
import torch
# Export the installed torch version as the TORCH environment variable; the
# pyg-lib install line below refers to it as `${TORCH}`.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# The wheel index URLs below are parameterised by the torch version, once via
# `{torch.__version__}` and once via `${TORCH}`.
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
# NOTE(review): the packages below are installed without version pins, so a
# fresh run may resolve different versions than the ones used originally.
!pip install pandas
!pip install matplotlib
!pip install pyarrow fastparquet
!pip install transformers
!pip install lightfm
!pip install memory-profiler
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install sentence_transformers==0.1.0

In [1]:
###### IMPORT #######
import numpy as np
import time
import pandas as pd
import random
import copy
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm import tqdm
from collections import defaultdict
import json
import multiprocessing
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.data import Dataset

from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import re
from transformers import AutoTokenizer, AutoModel
import os
import pickle
from memory_profiler import profile
# from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
########## SETUP ARGS ###########
# Notebook-wide configuration read as globals by the cells below.

# Number of leading interaction rows contract_loader keeps when mode == 'debug'.
len_interactions_to_consider = 100000
# Per-user budget of negative (user, item) pairs used by add_neg_samples.
# NOTE(review): the trailing remark about sBERT mentions 100k contracts and may
# belong to the limit above rather than to this value — confirm.
each_user_all2all_new_edges = 100 # when running sBERT, we have the embedding saved for 100k contracts
# Selects which loader runs: 'contract' or 'movie'.
dataset_mode = 'contract'

# Experiment names keyed by an integer code (codes 6-8 are not defined).
possible_experiments = {
    0: 'all',
    1: 'diversity',
    2: 'ucsp',
    3: 'icsp',
    4: 'usparsity',
    5: 'isparsity',
    9: 'add_social_edges',
}
# Active experiment; compared by name in the loader, split and CSP cells.
experiment = possible_experiments[0]
possible_modes = ['debug', 'experiment']
# Active run mode; 'debug' makes contract_loader truncate the interaction table.
mode = possible_modes[0]




In [3]:
#### DATA LOADER ####
from torch_geometric.data import download_url, extract_zip
from torch import Tensor
import pandas as pd
import numpy as np
import torch

def data_loader(ratings_df):
    """Assign consecutive integer ids to users and items and build the edge index.

    Args:
        ratings_df: interaction table with a 'userId' and an 'itemId' column,
            one row per (user, item) interaction.

    Returns:
        unique_user_id: DataFrame with columns 'userId' and 'mappedID', one row
            per distinct user in order of first appearance.
        unique_item_id: DataFrame with columns 'itemId' and 'mappedID', one row
            per distinct item in order of first appearance.
        edge_index_user_to_item: tensor of shape (2, len(ratings_df)); row 0
            holds mapped user ids and row 1 mapped item ids, aligned with the
            rows of ``ratings_df``.
    """
    def _build_lookup(column):
        # One row per distinct raw id; the mapped id is its position in that list.
        raw_values = ratings_df[column].unique()
        return pd.DataFrame(data={
            column: raw_values,
            'mappedID': pd.RangeIndex(len(raw_values)),
        })

    def _mapped_ids(column, lookup):
        # Left merge: one output row per interaction, in the table's row order.
        joined = pd.merge(ratings_df[column], lookup,
                          left_on=column, right_on=column, how='left')
        return torch.from_numpy(joined['mappedID'].values)

    unique_user_id = _build_lookup('userId')
    unique_item_id = _build_lookup('itemId')

    edge_index_user_to_item = torch.stack(
        [
            _mapped_ids('userId', unique_user_id),
            _mapped_ids('itemId', unique_item_id),
        ],
        dim=0,
    )
    return unique_user_id, unique_item_id, edge_index_user_to_item

def movie_loader():
    """Download MovieLens (ml-latest-small) and build ids, edges and genre features.

    The archive is downloaded and extracted into the current directory.

    Returns:
        unique_user_id, unique_item_id, edge_index_user_to_item: as produced by
            ``data_loader`` on the ratings table.
        items_df: movie table (columns renamed to 'itemId' / 'name') joined
            with the item id mapping and sorted by 'mappedID'.
        item_feat: float tensor with one genre indicator row per movie in
            ``items_df``.
        items_ratings_df: ratings table with 'movieId' renamed to 'itemId'.
    """
    archive_url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    extract_zip(download_url(archive_url, '.'), '.')

    items_ratings_df = pd.read_csv('./ml-latest-small/ratings.csv').rename(
        columns={'movieId': 'itemId'}
    )
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)

    # Movies without any rating get a NaN mappedID from the left join and end
    # up last after sorting (the original author notes this affects ~20 movies).
    items_df = (
        pd.read_csv('./ml-latest-small/movies.csv')
        .rename(columns={'movieId': 'itemId', 'title': 'name'})
        .merge(unique_item_id, on='itemId', how='left')
        .sort_values('mappedID')
    )

    # 'genres' is a '|'-separated list; expand it into one indicator column per genre.
    genre_indicators = items_df['genres'].str.get_dummies('|')
    print(genre_indicators[["Action", "Adventure", "Drama", "Horror"]].head())
    item_feat = torch.from_numpy(genre_indicators.values).to(torch.float)
    assert item_feat.size() == (9742, 20)  # 20 genres in total.

    return (unique_user_id, unique_item_id, edge_index_user_to_item,
            items_df, item_feat, items_ratings_df)

def movie_loader_sparse(k):
    """Load MovieLens (ml-latest-small) keeping only ``k`` percent of each user's ratings.

    The archive is downloaded and extracted into the current directory. Sampling
    is unseeded, so every call draws a different subset.

    Args:
        k: percentage (0-100) of every user's ratings to keep.

    Returns:
        unique_user_id, unique_item_id, edge_index_user_to_item: as produced by
            ``data_loader`` on the sampled ratings.
        items_df: movies that still have at least one sampled rating, joined
            with the item id mapping and sorted by 'mappedID'.
        item_feats_dict: ``{'item_feat_zeros', 'item_feat_movielens'}`` — an
            all-zero tensor and the genre indicator tensor, both of shape
            (num_items, num_genres).
        items_ratings_df: the sampled ratings table.
    """
    # Load Data
    url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    extract_zip(download_url(url, '.'), '.')
    movies_path = './ml-latest-small/movies.csv'
    ratings_path = './ml-latest-small/ratings.csv'

    # Read and rename columns
    items_ratings_df = pd.read_csv(ratings_path).rename(columns={'movieId': 'itemId'})
    items_df = pd.read_csv(movies_path).rename(columns={'movieId': 'itemId', 'title': 'name'})

    # Take the genre vocabulary from the FULL catalogue so the feature width
    # does not shrink when a rare genre happens to be sampled away.
    genre_columns = items_df['genres'].str.get_dummies('|').columns

    # Select k% of each user's ratings. GroupBy.sample keeps every column
    # (including 'userId'), unlike groupby.apply whose handling of the grouping
    # column is deprecated in recent pandas releases.
    items_ratings_df = (
        items_ratings_df.groupby('userId').sample(frac=k / 100).reset_index(drop=True)
    )

    # Keep only items present in the filtered ratings
    valid_item_ids = items_ratings_df['itemId'].unique()
    items_df = items_df[items_df['itemId'].isin(valid_item_ids)]

    # Recompute unique_user_id, unique_item_id, edge_index_user_to_item
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)

    # Merge and sort items data so feature row i belongs to mappedID i
    items_df = pd.merge(items_df, unique_item_id, on='itemId', how='left')
    items_df = items_df.sort_values('mappedID')

    # Process genres and create item features on the fixed vocabulary
    genres = (
        items_df['genres'].str.get_dummies('|')
        .reindex(columns=genre_columns, fill_value=0)
    )
    item_feat_movielens = torch.from_numpy(genres.values).to(torch.float)

    # One row per sampled item, one column per catalogue genre
    # (20 genres for ml-latest-small).
    assert item_feat_movielens.size() == (len(valid_item_ids), len(genre_columns))

    item_feat_zeros = torch.zeros_like(item_feat_movielens)
    item_feats_dict = {
        'item_feat_zeros': item_feat_zeros,
        'item_feat_movielens': item_feat_movielens
        }

    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feats_dict, items_ratings_df


def contract_loader():
    """Load the contract interaction data and build ids, edges and item features.

    Reads the notebook globals ``mode``, ``len_interactions_to_consider`` and
    ``experiment``, and the files 'dataset/user_contract_rating.parquet',
    'dataset/contract_name_topic.parquet', 'dataset/tfidf_embeddings_100k.npy'
    and 'dataset/sbert_embeddings_full.npy'.

    Returns:
        unique_user_id, unique_item_id, edge_index_user_to_item: output of
            ``data_loader`` on the interaction table.
        items_df: DataFrame with columns 'itemId' and 'name', one row per
            distinct item.
        item_feats_dict: feature tensors keyed 'item_feat_zeros',
            'item_feat_clustering', 'item_feat_tfidf' and 'item_feat_sbert'.
        item_feat_sbert_df: DataFrame pairing each mapped item id with its
            sBERT feature vector (as a Python list).
        items_ratings_df: interaction table with 'user'/'item' renamed to
            'userId'/'itemId'.
    """
    items_ratings_df = pd.read_parquet('dataset/user_contract_rating.parquet')
    # In debug mode only the first `len_interactions_to_consider` rows are kept.
    items_ratings_df = items_ratings_df[:len_interactions_to_consider] if mode == 'debug' else items_ratings_df
    
    def calculate_sparcity_value(df):
        """Return a sparsity figure computed from the 'user' and 'item' columns of ``df``."""
        num_users = df['user'].nunique()
        num_items = df['item'].nunique()
        num_interactions = len(df)
        # NOTE(review): the division by 100 is not present in the
        # `calculate_sparsity` helper of the movie branch; with it the value is
        # not 1 - density. Confirm whether this is intended.
        total_possible_interactions = num_users * num_items / 100
        sparsity = 1 - (num_interactions / total_possible_interactions)
        return sparsity
    
    def filter_interactions(df, column, k):
        """Keep only rows whose ``column`` value occurs at least ``k`` times; prints the sparsity figure."""
        valid_entries = df[column].value_counts()
        valid_entries = valid_entries[valid_entries >= k]
        df = df[df[column].isin(valid_entries.index)]
        print(f'{column} sparcity value is:', calculate_sparcity_value(df))
        return df

    ########## SPARCITY EXPERIMENT ###########
    if experiment == 'usparsity':
        # With u = 1 every user passes the `>= k` filter, so this only prints
        # the sparsity figure.
        u = 1
        items_ratings_df = filter_interactions(items_ratings_df, 'user', u)
    elif experiment == 'isparsity':
        i = 20
        items_ratings_df = filter_interactions(items_ratings_df, 'item', i)

    # Item table: one row per distinct item name, in order of first appearance;
    # 'itemId' is the factorize code of that name.
    items_df = {}
    items_df['name'] = items_ratings_df['item'].unique()
    items_df['itemId'], unique_names = pd.factorize(items_df['name'])
    # items_df['itemId'] = items_df['itemId'] + 1 #TODO test that commenting this line didn't break anything
    items_df = pd.DataFrame(items_df, columns=['itemId', 'name'])

    # NOTE: get_item_feat_sbert and get_item_feat_tfidf are defined but not
    # called in this function; the saved .npy embeddings are loaded instead.
    def get_item_feat_sbert(items_df):
        """Embed one text per item with a BERT model and return the mean-pooled hidden states."""
        contract2comments = pd.read_parquet('dataset/contracts2comment.parquet')
        # Rows where the documented class has the same name as the contract.
        c2c_main_class = contract2comments[contract2comments['contract_name'] == contract2comments['class_name']]

        def reorder_text(text):
            """Move the lines containing "@notice" to the front, keeping relative order."""
            lines = text.split("\n")
            notice_lines = [line for line in lines if "@notice" in line]
            other_lines = [line for line in lines if "@notice" not in line]
            reorderd_text = "\n".join(notice_lines + other_lines)
            return reorderd_text

        def preprocess_text(text):
            """Reorder, strip URLs and non-word characters, collapse whitespace, cap at 512 characters."""
            text = reorder_text(text)
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
            # Remove special characters, numbers, etc.
            text = re.sub(r'\W', ' ', text)
            # Remove extra spaces
            text = re.sub(r'\s+', ' ', text).strip()
            text = text[:512] if len(text) > 512 else text
            return text

        # One sentence per item: the main class documentation when available,
        # otherwise the space-joined class names recorded for that contract.
        sentences = []
        for i, item in items_df.iterrows():
            comment_class = c2c_main_class[c2c_main_class['contract_name'] == item['name']]
            if not comment_class.empty and comment_class['class_documentation'].iloc[0] != '':
                sentences.append(comment_class['class_documentation'].iloc[0])
            else:
                class_names = contract2comments[contract2comments['contract_name'] == item['name']]['class_name']
                sentences.append(' '.join(class_names))

        preprocessed_sentences = [preprocess_text(sentence) for sentence in sentences]
        tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        # Forced to CPU; the original author notes there is not enough GPU memory.
        device = torch.device("cpu") #"cuda" if torch.cuda.is_available() else "cpu") # NOT enough GPU memory
        model = model.to(device)
        # All sentences are tokenized and encoded in a single batch.
        inputs = tokenizer(preprocessed_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            # Plain mean over the sequence dimension.
            # NOTE(review): the attention mask is not applied here, so padded
            # positions presumably contribute to the mean — confirm intent.
            embeddings = outputs.last_hidden_state.mean(dim=1)
        item_feat = embeddings
        # model = SentenceTransformer('sentence-transformers/distilbert-base-nli-mean-tokens')
        # embeddings = model.encode(preprocessed_sentences)
        
        return item_feat
    
    def get_item_feat_tfidf(items_df):
        """Build a keyword indicator matrix from the per-contract keyword strings."""
        contract_top_words_df = pd.read_parquet('dataset/contract_top_words.parquet')
        contract_top_words_df = contract_top_words_df.rename(columns={'contract_name': 'name'})
        contracts_df_top_words = items_df.merge(contract_top_words_df, on='name', how='left')
        contracts_df_top_words['keywords'] = contracts_df_top_words['keywords'].fillna('')
        # From here on `items_df` is the merged frame, so the in-place
        # set_index below does not touch the caller's table.
        items_df = contracts_df_top_words
        items_df.set_index('itemId', inplace=True)
        # f =5 # ratio to determine the number of top keywords selected for each contract to construct item_feat
        # Splitting and re-joining on ',' returns the same string, so no
        # truncation actually happens here.
        items_df['truncated_keywords'] = items_df['keywords'].apply(lambda x: ','.join(x.split(',')))
        X_df = items_df['truncated_keywords'].str.get_dummies(',')
        item_feat = torch.from_numpy(X_df.values).to(torch.float)
        return item_feat
    
    def get_item_feat_clustering(items_df):
        """One-hot encode each item's 'most_probable_topic'; items without a topic get cluster 0."""
        contract_to_topic_df = pd.read_parquet("dataset/contract_name_topic.parquet")
        # NOTE(review): a left merge yields more rows than items if a
        # contract_name occurs more than once in the topic table — confirm it is unique.
        items_df = pd.merge(items_df, contract_to_topic_df, left_on='name', right_on='contract_name', how='left')
        items_df = items_df.rename(columns={'most_probable_topic': 'clusterId'})
        items_df['clusterId'] = items_df['clusterId'].fillna(0).astype(int)
        num_clusters = int(max(items_df['clusterId'])) + 1
        def one_hot(cluster_id, num_clusters):
            vec = torch.zeros(num_clusters)
            vec[int(cluster_id)] = 1
            return vec
        item_feat = torch.stack([one_hot(cid, num_clusters) for cid in items_df['clusterId']])
        return item_feat
        
    item_feat_clustering = get_item_feat_clustering(items_df)
    # Saved embeddings are loaded and cut to the number of items.
    # NOTE(review): this presumes row i of each saved array belongs to itemId i
    # of the current run; that cannot be verified from this cell.
    # item_feat_tfidf = get_item_feat_tfidf(items_df)
    item_feat_tfidf = np.load('dataset/tfidf_embeddings_100k.npy') # np.load('tfidf_embeddings_100k.npy')
    item_feat_tfidf = torch.from_numpy(item_feat_tfidf[:len(items_df['itemId'].unique())]).to(torch.float)
    # item_feat_sbert = get_item_feat_sbert(items_df)
    item_feat_sbert = np.load('dataset/sbert_embeddings_full.npy') #np.load('sbert_embeddings_100k.npy')
    item_feat_sbert = torch.from_numpy(item_feat_sbert[:len(items_df['itemId'].unique())]).to(torch.float)

    # data_loader expects 'userId' / 'itemId' column names.
    items_ratings_df = items_ratings_df.rename(columns={'user': 'userId', 'item': 'itemId'})
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)
    print('number of unique users', len(unique_user_id))
    print('number of unique items', len(unique_item_id))
    # TODO: create item_feat_df with itemId and item_feat tensor as columns
    # The 'itemId' column below holds data_loader's mappedID values; both
    # mappedID and items_df['itemId'] follow the first-appearance order of the
    # same item column.
    item_features_sbert_list = [feat.tolist() for feat in item_feat_sbert]
    item_feat_sbert_df = pd.DataFrame({
        'itemId': unique_item_id['mappedID'],
        'itemFeature': item_features_sbert_list
    })
    # All-zero features with the same shape as the sBERT features.
    item_feat_zeros = torch.zeros_like(item_feat_sbert)
    item_feats_dict = {
        'item_feat_zeros': item_feat_zeros, 
        'item_feat_clustering': item_feat_clustering, 
        'item_feat_tfidf': item_feat_tfidf, 
        'item_feat_sbert': item_feat_sbert
        }

    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feats_dict, item_feat_sbert_df, items_ratings_df

# Populate the notebook-level dataset variables for the configured dataset.
if dataset_mode == 'contract':
    (unique_user_id, unique_item_id, edge_index_user_to_item, items_df,
     item_feats_dict, item_feat_sbert_df, items_ratings_df) = contract_loader()
elif dataset_mode == 'movie':
    def calculate_sparsity(edge_index_user_to_item):
        """Return 1 - density of the user-item matrix described by the edge index.

        User and item counts are taken as ``max id + 1`` of the respective
        row, i.e. ids are assumed to start from 0.
        """
        user_count = edge_index_user_to_item[0].max() + 1
        item_count = edge_index_user_to_item[1].max() + 1
        interaction_count = edge_index_user_to_item.shape[1]
        return 1 - (interaction_count / (user_count * item_count))

    # Keep 30% of every user's ratings.
    (unique_user_id, unique_item_id, edge_index_user_to_item, items_df,
     item_feats_dict, items_ratings_df) = movie_loader_sparse(30)

    # Item count, average interactions per user, and sparsity of the sample.
    print(len(unique_item_id))
    print(len(edge_index_user_to_item[0])/len(unique_user_id))
    print(calculate_sparsity(edge_index_user_to_item))


number of unique users 10761
number of unique items 17310


In [4]:
######### TRAIN_TEST_GEN ##########

def train_test_generator(unique_user_id, item_feat, edge_index_user_to_item):
    """Build the user-item graph and split its edges with ``RandomLinkSplit``.

    Args:
        unique_user_id: user mapping table; only its length is used.
        item_feat: item feature tensor, one row per item.
        edge_index_user_to_item: (2, num_edges) tensor of user -> item edges.

    Returns:
        (data, train_data, test_data): the full graph and the train / test
        splits; the validation split is discarded (``num_val=0``).
    """
    rates = ("user", "rates", "item")

    # The graph is left directed here (no ToUndirected transform is applied).
    data = HeteroData()
    data["user"].node_id = torch.arange(len(unique_user_id))
    data["item"].node_id = torch.arange(item_feat.shape[0])
    data["item"].x = item_feat
    data[rates].edge_index = edge_index_user_to_item

    # NOTE(review): rev_edge_types names a relation this function never
    # creates — confirm RandomLinkSplit tolerates that on the pinned PyG version.
    split_options = dict(
        num_val=0,
        num_test=0.2,
        disjoint_train_ratio=0.3,
        neg_sampling_ratio=0,  # no negatives are generated by the split
        add_negative_train_samples=False,
        edge_types=rates,
        rev_edge_types=("item", "rev_rates", "user"),
    )
    train_data, _, test_data = T.RandomLinkSplit(**split_options)(data)
    return data, train_data, test_data

def custom_train_test_generator(unique_user_id, unique_item_id, item_feat, edge_index_user_to_item):
    """Build the full graph, node-only train/test graphs and 5 edge folds.

    Reads the notebook global ``experiment``; the cold-start experiments
    ('ucsp', 'icsp') are not implemented.

    Fixes over the previous version:
      * the loop that filled the train/test graphs rebound ``data``, so the
        function returned ``test_data`` in place of the full graph; the full
        undirected graph is now returned as documented;
      * a user shuffle whose result was discarded has been removed.

    Args:
        unique_user_id: user mapping table; only its length is used.
        unique_item_id: item mapping table (currently unused, kept for callers).
        item_feat: item feature tensor, one row per item.
        edge_index_user_to_item: (2, num_edges) tensor of user -> item edges.

    Returns:
        data: full graph with the 'rates' edges, made undirected.
        train_data, test_data: graphs holding node ids and item features only
            (no edges are attached here).
        folds: list of dicts with keys 'train_edges', 'test_edges',
            'disjoint_train_edges' and 'remaining_train_edges', built from a
            random permutation of the edges.

    Raises:
        NotImplementedError: if ``experiment`` is 'ucsp' or 'icsp'.
    """
    if experiment in ('ucsp', 'icsp'):
        # TODO: cold-start splits. The abandoned sketch split the users 80/20,
        # assigned edges by user membership, and kept 30% of the train edges
        # as a disjoint message-passing set.
        raise NotImplementedError(f"{experiment} split is not implemented")

    num_users = len(unique_user_id)

    def _node_only_graph():
        # Graph carrying node ids and item features, without any edges.
        graph = HeteroData()
        graph["user"].node_id = torch.arange(num_users)
        graph["item"].node_id = torch.arange(item_feat.shape[0])
        graph["item"].x = item_feat
        return graph

    data = _node_only_graph()
    data["user", "rates", "item"].edge_index = edge_index_user_to_item
    data = T.ToUndirected()(data)

    train_data, test_data = _node_only_graph(), _node_only_graph()

    # Note: edges can be shuffled without considering the labels, since they
    # are all ones before negative samples are added.
    def create_folds(edge_index_user_to_item, n_folds=5):
        """Cut the (already shuffled) edges into ``n_folds`` contiguous test folds."""
        num_edges = edge_index_user_to_item.size(1)
        batch_size = num_edges // n_folds
        folds = []

        for i in range(n_folds):
            start_idx = i * batch_size
            # The last fold absorbs the remainder of the integer division.
            end_idx = num_edges if i == n_folds - 1 else start_idx + batch_size
            test_edges = edge_index_user_to_item[:, start_idx:end_idx]
            train_edges = torch.cat(
                [edge_index_user_to_item[:, :start_idx], edge_index_user_to_item[:, end_idx:]],
                dim=1,
            )
            # First 30% of the train edges form the disjoint part, the rest
            # is kept for link prediction.
            disjoint_size = int(0.3 * train_edges.shape[1])
            folds.append({
                'train_edges': train_edges,
                'test_edges': test_edges,
                'disjoint_train_edges': train_edges[:, :disjoint_size],
                'remaining_train_edges': train_edges[:, disjoint_size:],
            })

        return folds

    indices = torch.randperm(edge_index_user_to_item.size(1))
    edge_index_user_to_item = edge_index_user_to_item[:, indices]
    folds = create_folds(edge_index_user_to_item)

    # TODO: attach negative samples to the test graph (3 negatives per positive
    # were planned); nothing is added to test_data here yet.

    return data, train_data, test_data, folds


In [5]:
######### GNN MODEL ##########
def GNN_recommender(data, train_data):
    """Train a heterogeneous GraphSAGE link predictor on the user-item graph.

    Args:
        data: graph used to size the model (item feature width, user/item node
            counts) and to supply the metadata passed to ``to_hetero``.
        train_data: graph the mini-batch loader samples from; must carry
            ``edge_label_index`` and ``edge_label`` on ("user", "rates", "item").

    Returns:
        The trained ``Model`` instance (left on the training device).
    """

    # Define seed edges:
    print('1')  # progress marker
    edge_label_index = train_data["user", "rates", "item"].edge_label_index
    edge_label = train_data["user", "rates", "item"].edge_label
    print('2')  # progress marker
    # Mini-batch loader over the supervision edges; neg_sampling_ratio=2.0
    # requests negative edges from the loader in addition to the given ones.
    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=[20, 10],
        neg_sampling_ratio=2.0,
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=128,
        shuffle=True,
    )
    print('3')  # progress marker

    # `Tensor` used in the annotations below is imported in the DATA LOADER cell.
    class GNN(torch.nn.Module):
        """Two SAGEConv layers with a ReLU in between."""
        def __init__(self, hidden_channels):
            super().__init__()
            self.conv1 = SAGEConv(hidden_channels, hidden_channels)
            self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
            x = F.relu(self.conv1(x, edge_index))
            x = self.conv2(x, edge_index)
            return x
    # Our final classifier applies the dot-product between source and destination
    # node embeddings to drive edge-level predictions:
    class Classifier(torch.nn.Module):
        """Scores each supervision edge by the dot product of its endpoint embeddings."""
        def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
            edge_feat_user = x_user[edge_label_index[0]] # Convert node embeddings to edge-level representations:
            edge_feat_item = x_item[edge_label_index[1]]
            scores = (edge_feat_user * edge_feat_item).sum(dim=-1)
            return scores # Apply dot-product to get a prediction per supervision edge:
        
    class Model(torch.nn.Module):
        """Input encoders + heterogeneous GNN + dot-product classifier."""
        def __init__(self, hidden_channels):
            super().__init__()
            # Since the dataset does not come with rich features, we also learn two
            # embedding matrices for users and items:
            self.item_lin = torch.nn.Linear(data['item'].x.shape[1], hidden_channels)
            self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(data["item"].num_nodes, hidden_channels)
            # Instantiate homogeneous GNN:
            self.gnn = GNN(hidden_channels)
            # Convert GNN model into a heterogeneous variant:
            self.gnn = to_hetero(self.gnn, metadata=data.metadata())
            self.classifier = Classifier()

        def forward(self, data: HeteroData) -> Tensor:
            # Users: learned embedding only. Items: projected features plus a
            # learned embedding, both looked up by node_id.
            x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "item": self.item_lin(data["item"].x) + self.item_emb(data["item"].node_id),
            } 
            # `x_dict` holds feature matrices of all node types
            # `edge_index_dict` holds all edge indices of all edge types
            x_dict = self.gnn(x_dict, data.edge_index_dict)
            pred = self.classifier(
                x_dict["user"],
                x_dict["item"],
                data["user", "rates", "item"].edge_label_index,
            )
            return pred
            
    ########## TRAINING ##########
    model = Model(hidden_channels=64)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: '{device}'")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # range(1, 10) runs 9 epochs.
    for epoch in range(1, 10):
        total_loss = total_examples = 0
        for sampled_data in tqdm(train_loader):
            optimizer.zero_grad()
            # The return value is not reassigned; presumably the batch is moved
            # in place — confirm against the PyG version in use.
            sampled_data.to(device)
            pred = model(sampled_data)
            ground_truth = sampled_data["user", "rates", "item"].edge_label
            # `pred` holds raw scores, hence the with-logits loss.
            loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
            loss.backward()
            optimizer.step()
            # Accumulate an example-weighted loss for the epoch average.
            total_loss += float(loss) * pred.numel()
            total_examples += pred.numel()

        # TODO: Add the val_loader, keep the best model
        print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

    ########## AUC EVAL VALIDATION #########
    # Disabled validation pass, kept as reference for the TODO above.
    # edge_label_index = val_data["user", "rates", "item"].edge_label_index
    # edge_label = val_data["user", "rates", "item"].edge_label
    # # val_data has neg samples in it
    # val_loader = LinkNeighborLoader(
    #     data=val_data,
    #     num_neighbors=[20, 10],
    #     edge_label_index=(("user", "rates", "item"), edge_label_index),
    #     edge_label=edge_label,
    #     batch_size=3 * 128,
    #     shuffle=False,
    # )
    # sampled_data = next(iter(val_loader))
    # preds = []
    # ground_truths = []
    # for sampled_data in tqdm(val_loader):
    #     with torch.no_grad():
    #         sampled_data.to(device)
    #         preds.append(model(sampled_data))
    #         ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
    # pred = torch.cat(preds, dim=0).cpu().numpy()
    # ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    # auc = roc_auc_score(ground_truth, pred)
    # print()
    # print(f"Validation AUC: {auc:.4f}")
    # return data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data, model
    return model

In [6]:
########## GNN TRAINING ############
#TODO: refactor below
def gnn_train(dataset_mode, data, train_data, item_feats_dict):
    """Train one GNN recommender per item-feature variant of the dataset.

    The first model of each dataset (the collaborative-filtering one) is
    trained on whatever item features ``data`` / ``train_data`` already carry
    when this function is called; each following model is trained after its
    features have been written to ``data['item'].x`` and
    ``train_data['item'].x``. Both graphs are therefore modified in place and
    keep the last feature set on return.

    Args:
        dataset_mode: 'contract' or 'movie'; any other value trains nothing.
        data: full graph passed to ``GNN_recommender``.
        train_data: training graph passed to ``GNN_recommender``.
        item_feats_dict: feature tensors keyed by feature name.

    Returns:
        Dict mapping model name to trained model, in training order.
    """
    # (model name, key into item_feats_dict); None = keep the features that
    # are currently attached to the graphs.
    training_plans = {
        'contract': [
            ('cf_gnn', None),
            ('hybrid_gnn_clustering', 'item_feat_clustering'),
            ('hybrid_gnn_tfidf', 'item_feat_tfidf'),
            ('hybrid_gnn_sbert', 'item_feat_sbert'),
        ],
        'movie': [
            ('cf_gnn_movielens', None),
            ('hybrid_gnn_movielens', 'item_feat_movielens'),
        ],
    }

    models = {}
    for model_name, feat_key in training_plans.get(dataset_mode, []):
        if feat_key is not None:
            item_feat = item_feats_dict[feat_key]
            data['item'].x = item_feat
            train_data['item'].x = item_feat
        models[model_name] = GNN_recommender(data, train_data)

    return models

        


In [7]:
####### NOT-COMPLETED CSP EXPERIMENTS-OLD #######
# Note the problem with this approach: after generating test and train, we simply remove from the test set the items or users that also appear in train. Clearly, removing part of user ... (TODO: the original note is cut off here)
### CSP #### note: if the ratio==1, rerun from the first step
if experiment in ('ucsp', 'icsp'):
    def csp_test_gen(train_data, test_data, unique_data, entity_index, experiment_abbr):
        """Restrict the test supervision edges to entities never seen in training.

        Args:
            train_data, test_data: graphs carrying ``edge_label_index`` (and,
                for ``test_data``, ``edge_label``) on ('user', 'rates', 'item').
            unique_data: mapping table with a 'mappedID' column listing every entity.
            entity_index: row of ``edge_label_index`` to inspect
                (0 = users, 1 = items).
            experiment_abbr: label used in the printed summary.

        Returns:
            (filtered copy of ``test_data``, fraction of test edges that were kept).
        """
        rates = ('user', 'rates', 'item')

        seen_in_train = set(train_data[rates].edge_label_index[entity_index].unique().numpy())
        all_entities = set(unique_data['mappedID'].unique())
        unseen_entities = all_entities - seen_in_train
        print(len(all_entities), len(unseen_entities))

        # True for every test edge whose entity does not occur in the training edges.
        test_entities = test_data[rates].edge_label_index[entity_index]
        mask = torch.tensor([entity in unseen_entities for entity in test_entities.numpy()])

        test_data_filtered = copy.deepcopy(test_data)
        filtered_store = test_data_filtered[rates]
        filtered_store.edge_label_index = filtered_store.edge_label_index[:, mask]
        filtered_store.edge_label = filtered_store.edge_label[mask]

        ratio = len(filtered_store.edge_label_index[entity_index]) / len(test_entities)
        print(f'test to train ratio {experiment_abbr}', ratio)

        return test_data_filtered, ratio

    # User cold start inspects the user row, item cold start the item row.
    test_data_csp, test_to_train_ratio_csp = csp_test_gen(
        train_data,
        test_data,
        *((unique_user_id, 0, 'CSP-user') if experiment == 'ucsp' else (unique_item_id, 1, 'CSP-item')),
    )
    print('test data len BEFOR CSP test gen:', len(test_data['user', 'rates', 'item'].edge_label_index[0]))
    print('test data len AFTER CSP test gen:', len(test_data_csp['user', 'rates', 'item'].edge_label_index[0]))


In [8]:
######## ALL_TO_ALL USER_ITEM PAIRS GENERATOR IN TEST_DATA #########
# if experiment == 'ucsp' or experiment == 'icsp': 
#     test_data_temp = copy.deepcopy(test_data_csp)
# test_data_all2all = copy.deepcopy(test_data_temp) # Now we're testing the Transform neg_sampeling instead of manually doing that
# if experiment == 'add_social_edges': all_items = [item for item in all_items if item > len(all_users)]# which helps keep the social_edges in test and be evaluated or remove social_edges in test_set?

def add_neg_samples(data, test_data, num_neg_per_user=None):
    """Return a copy of ``test_data`` extended with random negative (user, item) pairs.

    For every user occurring in the test supervision edges, items are visited
    in random order and pairs that are not edges of ``data`` are appended with
    label 0 until the per-user budget is reached. ``test_data`` itself is not
    modified.

    Fixes over the previous version: the budget check used ``>`` and therefore
    let one extra negative through per user; a redundant second copy of the
    edge set and a second deep copy of the graph were dropped.

    Args:
        data: graph whose ("user", "rates", "item").edge_index lists the known
            interactions and the candidate items.
        test_data: graph carrying ``edge_label_index`` / ``edge_label`` on the
            same edge type.
        num_neg_per_user: maximum number of negatives per test user; defaults
            to the notebook-level ``each_user_all2all_new_edges``.

    Returns:
        Deep copy of ``test_data`` with the negatives appended.
    """
    if num_neg_per_user is None:
        num_neg_per_user = each_user_all2all_new_edges

    rates = ("user", "rates", "item")
    known_edges = set(zip(data[rates].edge_index[0].numpy(), data[rates].edge_index[1].numpy()))
    test_users = test_data[rates].edge_label_index[0].unique().numpy()
    # unique() returns a fresh tensor, so shuffling this array in place does
    # not touch the graph's edge index.
    all_items = data[rates].edge_index[1].unique().numpy()

    new_edges = []
    for user_id in tqdm(test_users, total=len(test_users)):
        added_for_user = 0
        random.shuffle(all_items)
        for item_id in all_items:
            if added_for_user >= num_neg_per_user:
                break
            if (user_id, item_id) not in known_edges:
                new_edges.append((int(user_id), int(item_id)))
                added_for_user += 1

    test_data_all2all = copy.deepcopy(test_data)
    if new_edges:
        store = test_data_all2all[rates]
        new_edges_tensor = torch.tensor(new_edges, dtype=torch.int64).t().contiguous()
        new_labels_tensor = torch.zeros(len(new_edges), dtype=torch.int64)
        store.edge_label_index = torch.cat((store.edge_label_index, new_edges_tensor), dim=1)
        store.edge_label = torch.cat((store.edge_label, new_labels_tensor), dim=0)

    print('test edges shape BEFORE adding all possible user item pairs', test_data[rates].edge_label_index.shape)
    print('test edges shape AFTER adding all possible user item pairs', test_data_all2all[rates].edge_label_index.shape)
    print('unique test users', len(test_data_all2all[rates].edge_label_index[0].unique()))
    print('unique test items', len(test_data_all2all[rates].edge_label_index[1].unique()))

    return test_data_all2all




In [9]:
######## GNN PRED FOR test_data_w_neg_samples ######### 
import time
def gnn_pred(models, test_data_w_neg_samples, model_to_item_feat_dict):
    """Score every labelled test edge with each GNN variant.

    Args:
        models: Mapping of model name -> model. Each model is moved to the
            active device and called on every mini-batch.
        test_data_w_neg_samples: Hetero graph whose ("user", "rates", "item")
            store provides `edge_label_index` and `edge_label`.
            Side effect: its `['item'].x` is overwritten with the item
            features of each model in turn.
        model_to_item_feat_dict: Mapping of model name -> item feature matrix
            assigned to `['item'].x` before that model is evaluated.

    Returns:
        dict: `{name: {f'pred_{name}': ndarray, f'ground_truth_{name}': ndarray}}`.
    """

    def _gnn_pred(model_gnn, test_data_w_neg_samples, item_feat):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        test_data_w_neg_samples['item'].x = item_feat
        test_loader_gnn = LinkNeighborLoader(
            data=test_data_w_neg_samples,
            num_neighbors=[20, 10],
            edge_label_index=(("user", "rates", "item"), test_data_w_neg_samples["user", "rates", "item"].edge_label_index),
            edge_label=test_data_w_neg_samples["user", "rates", "item"].edge_label,
            batch_size= 3 * 128, # 3*128 TO calculate latency on inference time, use 17 as batch size, this will yield 10400 predictions that is near to MF number of preds (=10600)
            shuffle=False,
        )
        # The previous warm-up `next(iter(test_loader_gnn))` was never used; it
        # only sampled one extra batch and inflated the timed inference.
        preds_gnn = []
        ground_truths_gnn = []
        model_gnn = model_gnn.to(device)
        # Inference must not run with training-time behaviour (dropout, batch
        # norm statistics); this is a no-op for models without such layers.
        model_gnn.eval()
        with torch.no_grad():
            for sampled_data_gnn in tqdm(test_loader_gnn):
                sampled_data_gnn = sampled_data_gnn.to(device)
                preds_gnn.append(model_gnn(sampled_data_gnn))
                ground_truths_gnn.append(sampled_data_gnn["user", "rates", "item"].edge_label)
        pred_gnn = torch.cat(preds_gnn, dim=0).cpu().numpy()
        ground_truth_gnn = torch.cat(ground_truths_gnn, dim=0).cpu().numpy()
        print('all ground truth len', len(ground_truth_gnn))
        return pred_gnn, ground_truth_gnn

    preds = {}
    for model_name, model in models.items():
        model_preds = preds.setdefault(model_name, {})
        start_time = time.time()
        pred, ground_truth = _gnn_pred(model, test_data_w_neg_samples, model_to_item_feat_dict[model_name])
        end_time = time.time()
        model_preds[f'pred_{model_name}'] = pred
        model_preds[f'ground_truth_{model_name}'] = ground_truth
        print(f'required time for {model_name} inference on testset', end_time - start_time)
        # %memit pred_gnn_gen(device, test_data_w_neg_samples) %Uncomment to check the memory usage of GNN on inference

    return preds
    

In [10]:
########### DATA PREPARATION FOR MF & POP MODELS  #############
'''
For LightFM models, we need a df of train and test data, 
but from GNN train/test generation, we have a HeteroData
Here we turn a HeteroData to a DataFrame
'''

def mf_data_prep(test_data_w_neg_samples, train_data, unique_item_id):
    """Flatten the labelled edges of two hetero graphs into DataFrames.

    Each returned frame has one row per labelled ("user", "rates", "item")
    edge with columns `user`, `item` and `rating` (the edge label). When the
    global `dataset_mode` is 'contract', a `clusterId` column is added from
    "dataset/contract_name_topic.parquet" (items without a topic get 0).

    Args:
        test_data_w_neg_samples: Graph providing the test label edges.
        train_data: Graph providing the train label edges.
        unique_item_id: Frame with `mappedID` and `itemId` columns; only read
            in 'contract' mode.

    Returns:
        tuple: `(train_df_mf, test_df_mf)`.
    """
    edge_type = ('user', 'rates', 'item')

    def _label_edges_to_frame(hetero):
        # edge_label_index is (2, n_edges); transpose to one row per edge.
        store = hetero[edge_type]
        pairs = store.edge_label_index.numpy()
        labels = store.edge_label.numpy()
        frame = pd.DataFrame(pairs.T, columns=['user', 'item'])
        frame['rating'] = labels
        return frame

    def _attach_cluster_ids(frame, topic_table, id_table):
        # mapped item id -> contract name -> most probable topic
        topic_by_name = topic_table.set_index('contract_name')['most_probable_topic'].to_dict()
        name_by_mapped_id = id_table.set_index('mappedID')['itemId'].to_dict()
        frame['item_name'] = frame['item'].map(name_by_mapped_id)
        cluster_ids = frame['item_name'].map(topic_by_name)
        frame['clusterId'] = cluster_ids.fillna(0).astype(int)
        return frame.drop(columns=['item_name'])

    test_df_mf = _label_edges_to_frame(test_data_w_neg_samples)
    train_df_mf = _label_edges_to_frame(train_data)

    if dataset_mode == 'contract':
        contract_to_topic_df = pd.read_parquet("dataset/contract_name_topic.parquet")
        train_df_mf = _attach_cluster_ids(train_df_mf, contract_to_topic_df, unique_item_id)
        test_df_mf = _attach_cluster_ids(test_df_mf, contract_to_topic_df, unique_item_id)

    return train_df_mf, test_df_mf

In [11]:
########### POP & MF_N & MF_C TRAIN/PRED  #############
def pop_pred(preds, train_df_mf, test_df_mf, top_n=20):
    """Popularity baseline: predict 1 for the most frequent training items.

    Args:
        preds: Dict of per-model results; the 'pop' entry is created if
            missing and updated in place.
        train_df_mf: Training frame with an `item` column; popularity is the
            number of rows per item.
        test_df_mf: Test frame with `item` and `rating` columns.
            Side effect: a `pred_pop` column (0/1) is written to it.
        top_n: How many of the most frequent training items are predicted as
            relevant. Defaults to 20, the previously hard-coded value.

    Returns:
        dict: `preds`, with `preds['pop']['pred_pop']` and
        `preds['pop']['ground_truth_pop']` set to independent NumPy arrays.

    Raises:
        ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # value_counts() is sorted by frequency; slice the index positionally so
    # integer item ids can never be mistaken for labels.
    top_items = train_df_mf['item'].value_counts().index[:top_n]
    test_df_mf['pred_pop'] = test_df_mf['item'].isin(top_items).astype('int64')

    pop_entry = preds.setdefault('pop', {})
    # Copy so later writes to test_df_mf cannot alias the stored arrays.
    pop_entry['pred_pop'] = test_df_mf['pred_pop'].to_numpy(copy=True)
    pop_entry['ground_truth_pop'] = test_df_mf['rating'].to_numpy(copy=True)

    return preds

def mf_pred(preds, train_df_mf, test_df_mf, item_feat_sbert_df):
    """Train the LightFM baselines and score every row of the test frame.

    Three WARP models are fitted on `train_df_mf`: plain collaborative
    filtering ('cf_mf'), one with `clusterId` item features
    ('hybrid_mf_clustering') and one using `item_feat_sbert_df`
    ('hybrid_mf_sbert').

    Args:
        preds: Dict of per-model results, updated in place.
        train_df_mf: Frame with `user`, `item`, `rating` and `clusterId` columns.
        test_df_mf: Frame with the same columns.
            Side effect: a `predictions` column is (over)written per model.
        item_feat_sbert_df: Frame with an `itemFeature` column; only the length
            of its first entry is read.

    Returns:
        dict: `preds`, with `preds[name][f'pred_{name}']` and
        `preds[name][f'ground_truth_{name}']` set for each of the three models.
    """
    def initialize_dataset(train_df, test_df, additional_features=None):
        # Register every user/item seen in either split so test rows can be mapped.
        dataset = Dataset()
        user_ids = np.union1d(train_df['user'].unique(), test_df['user'].unique())
        item_ids = np.union1d(train_df['item'].unique(), test_df['item'].unique())
        dataset.fit(users=user_ids, items=item_ids, item_features=additional_features)
        return dataset

    def build_model_and_predict(train_df, test_df, dataset, model_params, additional_features=None):
        user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

        # zip over columns instead of iterrows(): iterrows() is slow and
        # up-casts integer ids to float when `rating` is a float column.
        train_interactions, _ = dataset.build_interactions(
            zip(train_df['user'], train_df['item'], train_df['rating']))

        model = LightFM(**model_params)
        item_features = None
        if additional_features is not None:
            # NOTE(review): only items that occur in train_df get side features
            # here; test-only items keep just their identity feature.
            item_features = dataset.build_item_features(
                (x, [y]) for x, y in zip(train_df['item'], additional_features))
        model.fit(train_interactions, item_features=item_features, epochs=30, num_threads=2)

        # Score into a private array so the three models never share (or
        # overwrite) each other's results through the `predictions` column.
        predictions = np.zeros(len(test_df), dtype=float)
        item_column = test_df['item'].to_numpy()
        positions_by_user = test_df.groupby('user').indices
        start_time = time.time()
        for user, positions in tqdm(positions_by_user.items(), total=len(positions_by_user)):
            user_id_internal = user_ids_mapping[user]
            item_ids_internal = np.array([item_ids_mapping[item] for item in item_column[positions]])
            # A model fitted with item features must be given the same matrix at
            # predict time; otherwise LightFM falls back to identity features
            # and the side information is silently ignored.
            predictions[positions] = model.predict(
                user_id_internal, item_ids_internal, item_features=item_features)
        end_time = time.time()
        print('Inference time:', end_time - start_time)

        test_df['predictions'] = predictions.copy()
        return predictions, test_df['rating'].to_numpy(copy=True)

    def cf_mf_pred_gen(train_df, test_df):
        dataset = initialize_dataset(train_df, test_df)
        return build_model_and_predict(train_df, test_df, dataset, {'loss': 'warp'})

    def hybrid_mf_clustering_pred_gen(train_df, test_df):
        cluster_ids = np.union1d(train_df['clusterId'].unique(), test_df['clusterId'].unique())
        dataset = initialize_dataset(train_df, test_df, additional_features=cluster_ids)
        return build_model_and_predict(train_df, test_df, dataset, {'loss': 'warp'}, additional_features=train_df['clusterId'])

    def hybrid_mf_sbert_pred_gen(train_df, test_df, item_feat_df):
        # NOTE(review): the embedding values in `itemFeature` are never used.
        # Passing range(dim) pairs the first `dim` training rows with feature
        # indices 0..dim-1, which looks unintended. A fix needs the item-id
        # column of item_feat_df, which is not visible here - TODO confirm.
        feature_range = range(len(item_feat_df['itemFeature'][0]))
        dataset = initialize_dataset(train_df, test_df, additional_features=feature_range)
        return build_model_and_predict(train_df, test_df, dataset, {'loss': 'warp'}, additional_features=feature_range)

    cf_entry = preds.setdefault('cf_mf', {})
    cf_entry['pred_cf_mf'], cf_entry['ground_truth_cf_mf'] = cf_mf_pred_gen(train_df_mf, test_df_mf)

    clustering_entry = preds.setdefault('hybrid_mf_clustering', {})
    clustering_entry['pred_hybrid_mf_clustering'], clustering_entry['ground_truth_hybrid_mf_clustering'] = hybrid_mf_clustering_pred_gen(train_df_mf, test_df_mf)

    sbert_entry = preds.setdefault('hybrid_mf_sbert', {})
    sbert_entry['pred_hybrid_mf_sbert'], sbert_entry['ground_truth_hybrid_mf_sbert'] = hybrid_mf_sbert_pred_gen(train_df_mf, test_df_mf, item_feat_sbert_df)

    return preds

    



In [12]:
####### METRIC EVAL #######
def preds_eval(test_data_w_neg_samples, preds, fold_counter, k_values=(1, 5, 10, 15, 20), results_file='evaluation_results.txt'):
    """Compute per-user HIT@k, MAP@k and NDCG@k for every model in `preds`.

    Results are appended to `results_file` and echoed to stdout.

    Args:
        test_data_w_neg_samples: Graph whose ("user", "rates", "item") store
            has `edge_label_index`; row 0 holds the user id of every scored
            edge, in the same order as the prediction arrays.
        preds: `{model: {f'pred_{model}': scores, f'ground_truth_{model}': labels}}`.
            The known variants are evaluated in their usual order, followed
            by any other keys found in `preds`. Variants missing from `preds`
            are skipped rather than raising `KeyError`.
        fold_counter: Fold number written as a header line.
        k_values: Cut-offs to evaluate.
        results_file: Path of the text file the results are appended to.

    Raises:
        ValueError: If a score/label array does not have one entry per edge.

    Notes:
        * AP@k is normalised by min(#relevant items of the user, k). The
          previous code divided by the hits inside the top-k, which made the
          `min(..., k)` a no-op and gave 1.0 to any list whose only hit was
          at rank 1, no matter how many relevant items were missed.
        * Each user's ranking is computed once per model and reused for every
          metric and k.
    """
    def hit_at_k(ranked_labels, k):
        # 1 if at least one relevant item is ranked in the top k, else 0.
        return int(np.sum(ranked_labels[:k]) > 0)

    def dcg_at_k(r, k):
        # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is equivalent.
        r = np.asarray(r, dtype=float)[:k]
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))

    def ndcg_at_k(ranked_labels, k):
        dcg_max = dcg_at_k(sorted(ranked_labels, reverse=True), k)
        if not dcg_max:
            return 0.
        return dcg_at_k(ranked_labels, k) / dcg_max

    def average_precision_at_k(ranked_labels, k):
        top_k_labels = ranked_labels[:k]
        relevant_positions = np.where(top_k_labels > 0)[0]
        if len(relevant_positions) == 0:
            return 0

        score = 0.0
        for i in relevant_positions:
            score += np.sum(top_k_labels[:i + 1]) / (i + 1)

        total_relevant = int(np.sum(ranked_labels > 0))
        return score / min(total_relevant, k)

    def rank_labels_per_user(ground_truth, pred, edge_users):
        # Returns one array per user (ascending user id): that user's labels
        # ordered by descending predicted score.
        ground_truth = np.asarray(ground_truth).reshape(-1)
        pred = np.asarray(pred).reshape(-1)  # also accepts (N, 1) model outputs
        if pred.shape[0] != edge_users.shape[0] or ground_truth.shape[0] != edge_users.shape[0]:
            raise ValueError(
                f"expected {edge_users.shape[0]} scores and labels, "
                f"got {pred.shape[0]} scores and {ground_truth.shape[0]} labels")
        if edge_users.size == 0:
            return []

        # A stable sort keeps each user's edges in their original order, so
        # every group equals what `edge_users == user_id` would select.
        by_user = np.argsort(edge_users, kind='stable')
        group_starts = np.flatnonzero(np.diff(edge_users[by_user])) + 1
        ranked = []
        for edge_positions in np.split(by_user, group_starts):
            by_score = np.argsort(pred[edge_positions])[::-1]
            ranked.append(ground_truth[edge_positions][by_score])
        return ranked

    edge_index = test_data_w_neg_samples['user', 'rates', 'item'].edge_label_index
    edge_users = np.asarray(edge_index[0])

    #TODO: add movielens
    known_variants = ['pop', 'cf_mf', 'hybrid_mf_clustering', 'hybrid_mf_sbert', 'cf_gnn', 'hybrid_gnn_clustering', 'hybrid_gnn_tfidf', 'hybrid_gnn_sbert']
    model_variants = [m for m in known_variants if m in preds]
    model_variants += [m for m in preds if m not in known_variants]

    metrics = (('HIT', hit_at_k), ('MAP', average_precision_at_k), ('NDCG', ndcg_at_k))

    with open(results_file, 'a') as file:
        file.write(f"Fold counter: {fold_counter}\n")

        for model in model_variants:
            ranked_labels_per_user = rank_labels_per_user(
                preds[model][f'ground_truth_{model}'], preds[model][f'pred_{model}'], edge_users)

            file.write(f"Results for {model}\n")
            for metric_name, metric_fn in metrics:
                for k in k_values:
                    value = np.mean([metric_fn(ranked_labels, k) for ranked_labels in ranked_labels_per_user])
                    file.write(f"{metric_name}@{k}: {value}\n")
                    print(f"{metric_name}@{k}: {value}")

In [13]:
# MAIN ########## TRAIN TEST GENERATION ############
# data, train_data, test_data = train_test_generator(unique_user_id, item_feat_zeros, edge_index_user_to_item)
# if dataset_mode == 'movie': model_to_item_feat_dict['hybrid_gnn_movielens'] #TODO: implement for MovieLens

# Item feature matrix used by each GNN variant, keyed by model name.
if dataset_mode == 'contract':
    model_to_item_feat_dict = {
        'cf_gnn': item_feats_dict['item_feat_zeros'],
        'hybrid_gnn_clustering': item_feats_dict['item_feat_clustering'],
        'hybrid_gnn_tfidf': item_feats_dict['item_feat_tfidf'],
        'hybrid_gnn_sbert': item_feats_dict['item_feat_sbert'],
    }
else:
    model_to_item_feat_dict = {
        'cf_gnn_movielens': item_feats_dict['item_feat_zeros'],
        'hybrid_gnn_movielens': item_feats_dict['item_feat_movielens'],
    }


data, train_data, test_data, folds = custom_train_test_generator(unique_user_id, unique_item_id, item_feats_dict['item_feat_zeros'], edge_index_user_to_item)

# NOTE(review): ToUndirected adds a `rev_<rel>` edge type for every bipartite
# edge type it finds. Re-applying it to the same objects on each fold (as the
# loop used to do) is therefore expected to add `rev_rev_rates`, then
# `rev_rev_rev_rates`, ... so later folds would train on a different schema
# than fold 1. Transform `data` once and give every fold fresh shallow copies.
data = T.ToUndirected()(data)
base_train_data, base_test_data = train_data, test_data

fold_counter = 1
for fold in folds:
    # copy.copy gives each fold its own stores (tensors are shared, not
    # duplicated), so edges/reverse edges set here never leak into the next fold.
    train_data = copy.copy(base_train_data)
    train_data["user", "rates", "item"].edge_index = fold['remaining_train_edges']
    train_data["user", "rates", "item"].edge_label_index = fold['disjoint_train_edges']
    train_data["user", "rates", "item"].edge_label = torch.ones(len(train_data["user", "rates", "item"].edge_label_index[0]))

    test_data = copy.copy(base_test_data)
    test_data["user", "rates", "item"].edge_index = fold['train_edges'] # TODO: check the correctness
    test_data["user", "rates", "item"].edge_label_index = fold['test_edges']
    test_data["user", "rates", "item"].edge_label = torch.ones(len(test_data["user", "rates", "item"].edge_label_index[0]))

    train_data = T.ToUndirected()(train_data)
    test_data = T.ToUndirected()(test_data)

    # Train the GNNs, score GNN / popularity / MF baselines on the same
    # negative-sampled test edges, then append this fold's metrics to disk.
    models = gnn_train(dataset_mode, data, train_data, item_feats_dict)
    test_data_w_neg_samples = add_neg_samples(data, test_data)
    preds = gnn_pred(models, test_data_w_neg_samples, model_to_item_feat_dict)
    train_df_mf, test_df_mf = mf_data_prep(test_data_w_neg_samples, train_data, unique_item_id)
    preds = pop_pred(preds, train_df_mf, test_df_mf)
    preds = mf_pred(preds, train_df_mf, test_df_mf, item_feat_sbert_df)
    preds_eval(test_data_w_neg_samples, preds, fold_counter)
    fold_counter += 1


#SOCIAL EDGES ABLATION EXPERIMENT#
# def add_social_edges(edge_index_user_to_item, unique_user_id, unique_item_id, items_ratings_df, item_feat):
#     unique_item_id = unique_item_id.copy()
#     # Define the filename where the data will be saved
#     filename = 'dataset/saved_social_edges_100k.pkl'

#     # Check if the file exists
#     if os.path.exists(filename):
#         # If it does, load the data and return it
#         with open(filename, 'rb') as f:
#             unique_user_id, unique_item_id_w_users, edge_index_user_to_item, item_feat = pickle.load(f)
#         print('Data loaded from file')
#     else:
#         user_transactions_df = pd.read_parquet('dataset/user_transactions.parquet')
#         contract_addresses = pd.read_parquet('dataset/contract_addresses.parquet')
#         contract_set = set(contract_addresses['address'])

#         # Shifting item_ids
#         edge_index_user_to_item[1] = edge_index_user_to_item[1] + len(edge_index_user_to_item[0].unique())
#         unique_item_id['mappedID'] = unique_item_id['mappedID'] + len(edge_index_user_to_item[0].unique())
#         #Adding user_ids to item_ids since now users can be an item too #TODO if the GNN performance turned to be bad, just add 'to' user addresses to both item_feat and unique_item_ids
#         unique_item_id_w_users = pd.concat([unique_user_id.rename(columns={'userId': 'entityId'}), unique_item_id.rename(columns={'itemId': 'entityId'})], axis=0)
#         user_feat = torch.zeros((len(edge_index_user_to_item[0].unique()), item_feat.shape[1]))
#         item_feat= torch.cat([user_feat, item_feat], dim=0) # Why don't we adding item_feat to user_feat?

#         # unique_item_id_w_users['type'] = 'user' or 'item'

#         users = items_ratings_df['userId'].unique()

#         print('edge index shape before adding social edges:', edge_index_user_to_item.shape)
#         count = 0
#         #note there is a 200k constraint, delete it
#         for i, interaction in tqdm(user_transactions_df.iterrows(), total=len(user_transactions_df)):
#             if interaction['from'] not in contract_set and interaction['to'] not in contract_set and interaction['from'] in users and interaction['to'] in users:
#                 if interaction['from'] == interaction['to']: continue
#                 from_user_id = unique_item_id_w_users[unique_item_id_w_users['entityId'] == interaction['from']]['mappedID'].iloc[0]
#                 to_user_id = unique_item_id_w_users[unique_item_id_w_users['entityId'] == interaction['to']]['mappedID'].iloc[0]
#                 social_edge = torch.tensor([[from_user_id], 
#                                             [to_user_id]], dtype=torch.int64)
#                 edge_index_user_to_item = torch.cat([edge_index_user_to_item, social_edge], dim=1)
#                 # count += 1
#                 # if count % 5 == 0: break
#         print('edge index shape after adding social edges:', edge_index_user_to_item.shape)
#         del user_transactions_df
#         del contract_addresses
#         del contract_set
#         import gc
#         gc.collect()

#         #uncomment below
#         with open(filename, 'wb') as f:
#             pickle.dump((unique_user_id, unique_item_id_w_users, edge_index_user_to_item, item_feat), f)
#         print('social edges saved to dataset/saved_social_edges_100k.pkl')
    
#     return unique_user_id, unique_item_id_w_users, edge_index_user_to_item, item_feat
# if experiment == 'add_social_edges':
#     #TODO: if wanna use the social edges, need to pass the arguments
#     unique_user_id_w_social, unique_item_id_w_social, edge_index_user_to_item_w_social, item_feat_w_social = add_social_edges(edge_index_user_to_item, unique_user_id, unique_item_id, items_ratings_df, item_feat)
#     data, train_data, test_data = _train_test_generator(unique_user_id_w_social, item_feat_w_social, edge_index_user_to_item_w_social)


1
2
3
Device: 'cuda'


  0%|          | 0/188 [00:00<?, ?it/s]

In [None]:
############# DIVERSITY EXPERIMENT ##############
# NOTE(review): this cell reads `ground_truth_*` / `pred_*` arrays, `experiment`
# and `test_data_all2all`, none of which are assigned in the visible cells
# above - confirm where they come from before relying on Restart & Run All.
model_variants = ['mfn', 'mfc', 'pop']
eval_loader = {
    # 'gnn': {
    #     'ground_truth': ground_truth_gnn,
    #     'pred': pred_gnn
    # },
    'pop': {
        'ground_truth': ground_truth_pop,
        'pred': pred_pop
    },
    'mfn': {
        'ground_truth': ground_truth_mfn,
        'pred': pred_mfn
    },
    'mfc': {
        'ground_truth': ground_truth_mfc,
        'pred': pred_mfc
    },

}
# Previously assigned inside a no-op loop over model_variants, with identical
# values for debug and non-debug mode.
k_values = [1, 5, 10, 15, 20]
if experiment == 'diversity':
    edge_index = test_data_all2all['user', 'rates', 'item'].edge_label_index
    edge_users = edge_index[0].numpy()
    edge_items = edge_index[1].numpy()
    user_ids = np.unique(edge_users)
    n_candidate_items = len(np.unique(edge_items))

    for model_variant_eval in model_variants:
        pred = eval_loader[model_variant_eval]['pred']
        ground_truth = eval_loader[model_variant_eval]['ground_truth']

        # Rank each user's candidates once (descending score) and reuse the
        # ranking for every k and both metrics.
        ranked_items_per_user = []
        ranked_labels_per_user = []
        for user_id in user_ids: # tqdm(user_ids, total=len(user_ids)):
            mask = edge_users == user_id
            sorted_indices = np.argsort(pred[mask])[::-1]
            ranked_items_per_user.append(edge_items[mask][sorted_indices])
            ranked_labels_per_user.append(ground_truth[mask][sorted_indices])

        # Item coverage: share of all candidate items that show up in at
        # least one user's top-k list.
        for k in k_values:
            recommended_items = set()
            for ranked_items in ranked_items_per_user:
                recommended_items.update(ranked_items[:k])

            diversity_at_k = len(recommended_items) / n_candidate_items
            print(f'Item coverage diversity for {model_variant_eval} @{k}:', diversity_at_k)

        # User coverage: share of users with at least one relevant item in
        # their top-k list.
        for k in k_values:
            users_with_relevant_recs = sum(
                1 for ranked_labels in ranked_labels_per_user
                if np.sum(ranked_labels[:k]) > 0  # At least one relevant recommendation
            )

            user_coverage_at_k = users_with_relevant_recs / len(user_ids)
            print(f'User coverage for {model_variant_eval} @{k}:', user_coverage_at_k)

    #######  Intra-List Diversity #######
    # TODO: Based on item_feat define the compute_dissimilarity method
    # for k in k_values:
    #     avg_dissimilarity = []
        
    #     for user_id in tqdm(user_ids, total=len(user_ids)):
    #         mask = edge_index[0] == user_id
    #         filtered_pred = pred[mask]
    #         filtered_items = edge_index[1][mask]
    #         sorted_indices = np.argsort(filtered_pred)[::-1]
    #         top_k_indices = sorted_indices[:k]
    #         top_k_items = filtered_items[top_k_indices].numpy()
            
    #         dissimilarity_sum = 0
    #         for i in range(len(top_k_items)):
    #             for j in range(i+1, len(top_k_items)):
    #                 dissimilarity_sum += compute_dissimilarity(top_k_items[i], top_k_items[j])
            
    #         if k > 1:
    #             avg_pairwise_dissimilarity = 2 * dissimilarity_sum / (k * (k - 1))
    #             avg_dissimilarity.append(avg_pairwise_dissimilarity)
        
    #     intra_list_diversity_at_k = np.mean(avg_dissimilarity)
    #     print(f'Intra-list diversity for {model_mode_eval} @{k}:', intra_list_diversity_at_k)

'''
Item coverage diversity for gnn @1: 0.1974024375230826
Item coverage diversity for gnn @5: 0.6526529607287948
Item coverage diversity for gnn @10: 0.9267512002954573
Item coverage diversity for gnn @15: 0.9996922319340146
Item coverage diversity for gnn @20: 1.0
User coverage for gnn @1: 0.5559827456864216
User coverage for gnn @5: 0.6452550637659414
User coverage for gnn @10: 0.6614778694673669
User coverage for gnn @15: 0.6639159789947486
User coverage for gnn @20: 0.6641035258814704
Item coverage diversity for mfn @1: 0.18576880462883172
Item coverage diversity for mfn @5: 0.5387172227009726
Item coverage diversity for mfn @10: 0.9447864089622061
Item coverage diversity for mfn @15: 0.9996922319340146
Item coverage diversity for mfn @20: 0.999938446386803
User coverage for mfn @1: 0.5041260315078769
User coverage for mfn @5: 0.5982745686421606
User coverage for mfn @10: 0.6395348837209303
User coverage for mfn @15: 0.6611965491372843
User coverage for mfn @20: 0.6641035258814704
'''
