In [None]:
####### INSTALATION #######

!pip uninstall torch -y
!pip install torch==1.13.1
# !pip uninstall torch-scatter -y
# !pip uninstall torch-sparse -y
# !pip uninstall pyg-lib -y
# !pip uninstall git+https://github.com/pyg-team/pytorch_geometric.git -y
# !pip uninstall sentence_transformers -y

import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
!pip install pandas
!pip install matplotlib
!pip install pyarrow fastparquet
!pip install transformers
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install sentence_transformers==0.1.0

In [26]:
###### IMPORT #######
import numpy as np
import pandas as pd
import random
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm import tqdm
from collections import defaultdict
import json
import multiprocessing
import matplotlib.pyplot as plt

from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import copy
import re
from transformers import AutoTokenizer, AutoModel
# from sentence_transformers import SentenceTransformer

In [80]:
possible_experiments = ['none', 'diversity', 'ucsp', 'icsp', 'usparcity', 'isparcity', 'sBERT', 'TFIDF', 'ablation_item_feat', 'ablation_social_edges']
possible_experiments = {
    0: 'none',
    1: 'diversity',
    2: 'ucsp',
    3: 'icsp',
    4: 'usparcity',
    5: 'isparcity',
    6: 'sBERT',
    7: 'TFIDF',
    8: 'ablation_item_feat', 
    9: 'ablation_social_edges', # meaning adding social relationships
}
possible_modes = ['debug', 'experiment']
model_variants = ['gnn', 'nmf', 'cmf']

experiment = possible_experiments[9]
mode = possible_modes[0]
model_variant_eval = model_variants[0]



In [84]:
#### DATA LOADER ####
from torch_geometric.data import download_url, extract_zip
from torch import Tensor

def data_loader(ratings_df):
    unique_user_id = ratings_df['userId'].unique()
    unique_user_id = pd.DataFrame(data={
        'userId': unique_user_id,
        'mappedID': pd.RangeIndex(len(unique_user_id)),
    })
    # print("Mapping of user IDs to consecutive values:")
    # print("==========================================")
    # print(unique_user_id.head())

    unique_item_id = ratings_df['itemId'].unique()
    unique_item_id = pd.DataFrame(data={
        'itemId': unique_item_id,
        'mappedID': pd.RangeIndex(len(unique_item_id)),
    })
    # print("Mapping of item IDs to consecutive values:")
    # print("===========================================")
    # print(unique_item_id.head())

    ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                                left_on='userId', right_on='userId', how='left')
    ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
    ratings_item_id = pd.merge(ratings_df['itemId'], unique_item_id,
                                left_on='itemId', right_on='itemId', how='left')
    ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].values)
    edge_index_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)
    # print()
    # print("Final edge indices pointing from users to items:")
    # print("=================================================")
    # print(edge_index_user_to_item)
    return unique_user_id, unique_item_id, edge_index_user_to_item

def movie_loader():
    url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    extract_zip(download_url(url, '.'), '.')
    movies_path = './ml-latest-small/movies.csv'
    ratings_path = './ml-latest-small/ratings.csv'
    items_ratings_df = pd.read_csv(ratings_path)
    items_ratings_df = items_ratings_df.rename(columns={'movieId': 'itemId'})
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)
    items_df = pd.read_csv(movies_path)
    items_df = items_df.rename(columns={'movieId': 'itemId', 'title': 'name'})
    items_df = pd.merge(items_df, unique_item_id, on='itemId', how='left')
    items_df = items_df.sort_values('mappedID') # (Just the last 20 movies have NaN mappedId)
    genres = items_df['genres'].str.get_dummies('|')
    print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
    item_feat = torch.from_numpy(genres.values).to(torch.float)
    assert item_feat.size() == (9742, 20)  # 20 genres in total.
    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feat, items_ratings_df

def contract_loader():
    items_ratings_df = pd.read_parquet('dataset/user_contract_rating.parquet')
    def calculate_sparcity_value(df):
        num_users = df['user'].nunique()
        num_items = df['item'].nunique()
        num_interactions = len(df)
        total_possible_interactions = num_users * num_items / 100
        sparsity = 1 - (num_interactions / total_possible_interactions)
        return sparsity
    
    def filter_interactions(df, column, k):
        valid_entries = df[column].value_counts()
        valid_entries = valid_entries[valid_entries > k]
        df = df[df[column].isin(valid_entries.index)]
        print(f'{column} sparcity value is:', calculate_sparcity_value(df))
        return df

    ########## SPARCITY EXPERIMENT ###########
    if experiment == 'usparcity':
        k = 5
        items_ratings_df = filter_interactions(items_ratings_df, 'user', k)
    elif experiment == 'isparcity':
        k = 5
        items_ratings_df = filter_interactions(items_ratings_df, 'item', k)

    items_ratings_df = items_ratings_df[:100000] if mode == 'debug' else items_ratings_df
    items_df = {}
    items_df['name'] = items_ratings_df['item'].unique()
    items_df['itemId'], unique_names = pd.factorize(items_df['name'])
    # items_df['itemId'] = items_df['itemId'] + 1 #TODO test commenting this line didn't breal anything
    items_df = pd.DataFrame(items_df, columns=['itemId', 'name'])

    def get_item_feat_sbert(items_df):
        contract2comments = pd.read_parquet('dataset/contracts2comment.parquet')
        c2c_main_class = contract2comments[contract2comments['contract_name'] == contract2comments['class_name']]

        def reorder_text(text):
            lines = text.split("\n")
            notice_lines = [line for line in lines if "@notice" in line]
            other_lines = [line for line in lines if "@notice" not in line]
            reorderd_text = "\n".join(notice_lines + other_lines)
            return reorderd_text

        def preprocess_text(text):
            text = reorder_text(text)
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
            # Remove special characters, numbers, etc.
            text = re.sub(r'\W', ' ', text)
            # Remove extra spaces
            text = re.sub(r'\s+', ' ', text).strip()
            text = text[:512] if len(text) > 512 else text
            return text

        sentences = []
        for i, item in items_df.iterrows():
            comment_class = c2c_main_class[c2c_main_class['contract_name'] == item['name']]
            if not comment_class.empty and comment_class['class_documentation'].iloc[0] != '':
                sentences.append(comment_class['class_documentation'].iloc[0])
            else:
                class_names = contract2comments[contract2comments['contract_name'] == item['name']]['class_name']
                sentences.append(' '.join(class_names))

        preprocessed_sentences = [preprocess_text(sentence) for sentence in sentences]
        tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        device = torch.device("cpu") #"cuda" if torch.cuda.is_available() else "cpu") # NOT enough GPU memory
        model = model.to(device)
        inputs = tokenizer(preprocessed_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
        item_feat = embeddings
        # model = SentenceTransformer('sentence-transformers/distilbert-base-nli-mean-tokens')
        # embeddings = model.encode(preprocessed_sentences)
        
        return item_feat
    
    def get_item_feat_tfidf(items_df):
        contract_top_words_df = pd.read_parquet('dataset/contract_top_words.parquet')
        contract_top_words_df = contract_top_words_df.rename(columns={'contract_name': 'name'})
        contracts_df_top_words = items_df.merge(contract_top_words_df, on='name', how='left')
        contracts_df_top_words['keywords'] = contracts_df_top_words['keywords'].fillna('')
        items_df = contracts_df_top_words
        items_df.set_index('itemId', inplace=True)
        # f =5 # ratio to determine the number of top keywords selected for each contract to construct item_feat
        items_df['truncated_keywords'] = items_df['keywords'].apply(lambda x: ','.join(x.split(',')))
        X_df = items_df['truncated_keywords'].str.get_dummies(',')
        item_feat = torch.from_numpy(X_df.values).to(torch.float)
        return item_feat
    
    ########### SBERT EXPERIMENT ###########
    if experiment == 'sBERT':
        item_feat = get_item_feat_sbert(items_df)
    else:
        item_feat = get_item_feat_tfidf(items_df)

    print('item feature tensor shape', item_feat.shape)
    items_ratings_df = items_ratings_df.rename(columns={'user': 'userId', 'item': 'itemId'})
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)
    print('number of unique users', len(unique_user_id))
    print('number of unique items', len(unique_item_id))
    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feat, items_ratings_df

universal_mode = 'contract'
loaders = {
    'contract_loader': contract_loader,
    'movie_loader': movie_loader,
}
unique_user_id, unique_item_id, edge_index_user_to_item, items_df, item_feat, items_ratings_df = loaders[f'{universal_mode}_loader']()


item feature tensor shape torch.Size([17310, 8023])
number of unique users 10761
number of unique items 17310


In [85]:
######### LINK BINARY PRED MODEL ##########
def train_test_generator(unique_user_id, item_feat, edge_index_user_to_item):  
    data = HeteroData()
    data["user"].node_id = torch.arange(len(unique_user_id))
    data["item"].node_id = torch.arange(item_feat.shape[0])
    data["item"].x = item_feat
    data["user", "rates", "item"].edge_index = edge_index_user_to_item
    data = T.ToUndirected()(data)

    transform = T.RandomLinkSplit(
        num_val=0,
        num_test=0.2,
        disjoint_train_ratio=0.3,
        neg_sampling_ratio=0,
        add_negative_train_samples=False,
        edge_types=("user", "rates", "item"),
        rev_edge_types=("item", "rev_rates", "user"), 
    )
    
    train_data, val_data, test_data = transform(data)
    return data, train_data, test_data

def GNN_recommender(data, train_data):

    # Define seed edges:
    edge_label_index = train_data["user", "rates", "item"].edge_label_index
    edge_label = train_data["user", "rates", "item"].edge_label
    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=[20, 10],
        neg_sampling_ratio=2.0,
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=128,
        shuffle=True,
    )
    print('train loader:', type(train_loader))

    class GNN(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            self.conv1 = SAGEConv(hidden_channels, hidden_channels)
            self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
            x = F.relu(self.conv1(x, edge_index))
            x = self.conv2(x, edge_index)
            return x
    # Our final classifier applies the dot-product between source and destination
    # node embeddings to derive edge-level predictions:
    class Classifier(torch.nn.Module):
        def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
            edge_feat_user = x_user[edge_label_index[0]] # Convert node embeddings to edge-level representations:
            edge_feat_item = x_item[edge_label_index[1]]
            scores = (edge_feat_user * edge_feat_item).sum(dim=-1)
            return scores # Apply dot-product to get a prediction per supervision edge:
        
    class Model(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            # Since the dataset does not come with rich features, we also learn two
            # embedding matrices for users and items:
            self.item_lin = torch.nn.Linear(item_feat.shape[1], hidden_channels)
            self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(data["item"].num_nodes, hidden_channels)
            # Instantiate homogeneous GNN:
            self.gnn = GNN(hidden_channels)
            # Convert GNN model into a heterogeneous variant:
            self.gnn = to_hetero(self.gnn, metadata=data.metadata())
            self.classifier = Classifier()

        def forward(self, data: HeteroData) -> Tensor:
            x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "item": self.item_lin(data["item"].x) + self.item_emb(data["item"].node_id),
            } 
            # `x_dict` holds feature matrices of all node types
            # `edge_index_dict` holds all edge indices of all edge types
            x_dict = self.gnn(x_dict, data.edge_index_dict)
            pred = self.classifier(
                x_dict["user"],
                x_dict["item"],
                data["user", "rates", "item"].edge_label_index,
            )
            return pred
            
    ########## TRAINING ##########
    model = Model(hidden_channels=64)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: '{device}'")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(1, 10):
        total_loss = total_examples = 0
        for sampled_data in tqdm(train_loader):
            optimizer.zero_grad()
            sampled_data.to(device)
            pred = model(sampled_data)
            ground_truth = sampled_data["user", "rates", "item"].edge_label
            loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
            loss.backward()
            optimizer.step()
            total_loss += float(loss) * pred.numel()
            total_examples += pred.numel()

        # TODO: Add the val_loader, keep the best model
        print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

    ########## AUC EVAL VALIDATION #########
    # edge_label_index = val_data["user", "rates", "item"].edge_label_index
    # edge_label = val_data["user", "rates", "item"].edge_label
    # # val_data has neg samples in it
    # val_loader = LinkNeighborLoader(
    #     data=val_data,
    #     num_neighbors=[20, 10],
    #     edge_label_index=(("user", "rates", "item"), edge_label_index),
    #     edge_label=edge_label,
    #     batch_size=3 * 128,
    #     shuffle=False,
    # )
    # sampled_data = next(iter(val_loader))
    # preds = []
    # ground_truths = []
    # for sampled_data in tqdm(val_loader):
    #     with torch.no_grad():
    #         sampled_data.to(device)
    #         preds.append(model(sampled_data))
    #         ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
    # pred = torch.cat(preds, dim=0).cpu().numpy()
    # ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    # auc = roc_auc_score(ground_truth, pred)
    # print()
    # print(f"Validation AUC: {auc:.4f}")
    # return data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data, model
    return model

In [86]:
########## TRAIN TEST GENERAION ############

####### ITEM FEAT ABLATION EXPRIMENT ####### 
if experiment == 'ablation_item_feat':
    item_feat = torch.zeros_like(item_feat)

# ####### SOCIAL EDGES ABLEATION EXPERIMENT #######
def add_social_edges(edge_index_user_to_item, unique_item_id, items_ratings_df, item_feat):
    user_transactions_df = pd.read_csv('dataset/user_transactions.csv')
    contract_addresses = pd.read_csv('dataset/contract_addresses.csv')
    contract_set = set(contract_addresses['address'])
    
    edge_index_user_to_item[1] = edge_index_user_to_item[1] + len(edge_index_user_to_item[0].unique())
    unique_item_id['mappedID'] = unique_item_id['mappedID'] + len(edge_index_user_to_item[0].unique())
    user_feat = torch.zeros((len(edge_index_user_to_item[0].unique()), item_feat.shape[1]))
    item_feat= torch.cat([item_feat, user_feat], dim=0)

    unique_user_id['type'] = 'user'
    unique_item_id['type'] = 'item'
    node2id = pd.concat([
        unique_item_id.rename(columns={'itemId': 'entityId'}),
        unique_user_id.rename(columns={'userId': 'entityId'})
    ], ignore_index=True)

    users = items_ratings_df['userId'].unique()

    print('edge index shape before adding social edges:', edge_index_user_to_item.shape)
    for i, interaction in tqdm(user_transactions_df.iterrows(), total=len(user_transactions_df)):
        if interaction['from'] not in contract_set and interaction['to'] not in contract_set and interaction['from'] in users and  interaction['to'] in users:
            from_user_id = node2id[node2id['entityId'] == interaction['from']]['mappedID'].iloc[0]
            to_user_id = node2id[node2id['entityId'] == interaction['to']]['mappedID'].iloc[0]
            social_edge = torch.tensor([[from_user_id], 
                                        [to_user_id]], dtype=torch.int64)
            edge_index_user_to_item = torch.cat([edge_index_user_to_item, social_edge], dim=1)
    print('edge index shape after adding social edges:', edge_index_user_to_item.shape)
    
    return unique_user_id, unique_item_id, edge_index_user_to_item, item_feat
if experiment == 'ablation_social_edges':
    unique_user_id, unique_item_id, edge_index_user_to_item, item_feat = add_social_edges(edge_index_user_to_item, unique_item_id, items_ratings_df, item_feat)


data, train_data, test_data = train_test_generator(unique_user_id, item_feat, edge_index_user_to_item)

edge index shape before adding social edges: torch.Size([2, 100000])


100%|██████████| 1738651/1738651 [09:05<00:00, 3188.78it/s]


edge index shape after adding social edges: torch.Size([2, 107559])


In [None]:
####### CSP EXPRIMENTS #######
### CSP #### note: if the ratio==1, rerun from the first step
if experiment == 'ucsp' or experiment == 'icsp':
    def csp_test_gen(train_data, test_data, unique_data, entity_index, experiment_abbr):
        train_data_unique_entities = set(train_data['user', 'rates', 'item'].edge_label_index[entity_index].unique().numpy())
        unique_entities = set(unique_data['mappedID'].unique())
        entities_not_in_train = unique_entities - train_data_unique_entities
        mask = torch.tensor([entity in entities_not_in_train for entity in test_data["user", "rates", "item"].edge_label_index[entity_index].numpy()])
        
        test_data_filtered = copy.deepcopy(test_data)
        test_data_filtered["user", "rates", "item"].edge_label_index = test_data_filtered["user", "rates", "item"].edge_label_index[:, mask]
        test_data_filtered["user", "rates", "item"].edge_label = test_data_filtered["user", "rates", "item"].edge_label[mask]
        
        ratio = len(test_data_filtered["user", "rates", "item"].edge_label_index[entity_index]) / len(test_data["user", "rates", "item"].edge_label_index[entity_index])
        print(f'test to train ratio {experiment_abbr}', ratio)
        
        return test_data_filtered, ratio

    test_data_csp, test_to_train_ratio_csp = csp_test_gen(
        train_data, test_data, unique_user_id, 0 if experiment == 'ucsp' else 1, 'CSP-user' if experiment == 'ucsp' else 'CSP-item'
    )

In [76]:
######## ALL_TO_ALL USER_ITEM PAIRS GENERATOR IN TEST_DATA #########

# If mode GNN run below
### SLICING TEST_DATA FOR ALL_TO_ALL EVAL ###
slice_rate = 0.1
if experiment == 'ucsp' or experiment == 'icsp': 
    slice_rate = 1
    test_data = test_data_csp

test_data["user", "rates", "item"].edge_label_index = test_data["user", "rates", "item"].edge_label_index[:, : int(slice_rate * len(test_data["user", "rates", "item"].edge_label_index[0]))]
test_data["user", "rates", "item"].edge_label = test_data["user", "rates", "item"].edge_label[ : int(slice_rate * len(test_data["user", "rates", "item"].edge_label))]

edge_index_test = set(zip(test_data["user", "rates", "item"].edge_label_index[0].numpy(), test_data["user", "rates", "item"].edge_label_index[1].numpy()))

all_users = test_data["user", "rates", "item"].edge_label_index[0].unique().numpy()
all_items = test_data["user", "rates", "item"].edge_label_index[1].unique().numpy()

# which elp the model most: keep the social_edges in test and be evaluated or remove all social_edges in test_set?
if experiment == 'ablation_social_edges':
    print(len(all_items))
    all_items = [item for item in test_data["user", "rates", "item"].edge_label_index[1].unique().numpy() if item > len(all_users)]
    print(len(all_items))

new_edges = []
new_labels = []

#TODO instead of all possible pairs, we can continue for each user if it reaches to x samples (pos + neg)
for user_id in all_users:
    for item_id in all_items:
        if (user_id, item_id) not in edge_index_test:
            new_edges.append((user_id, item_id))
            new_labels.append(0)

import copy
test_data_all2all = copy.deepcopy(test_data)

if new_edges:
    new_edges_tensor = torch.tensor(new_edges, dtype=torch.int64).t().contiguous()
    new_labels_tensor = torch.tensor(new_labels, dtype=torch.int64)

    test_data_all2all["user", "rates", "item"].edge_label_index = torch.cat((test_data_all2all["user", "rates", "item"].edge_label_index, new_edges_tensor), dim=1)
    test_data_all2all["user", "rates", "item"].edge_label = torch.cat((test_data_all2all["user", "rates", "item"].edge_label, new_labels_tensor), dim=0)

print('test edges shape BEFORE adding all possible user item pairs', test_data["user", "rates", "item"].edge_label_index.shape)
print('test edges shape AFTER adding all possible user item pairs', test_data_all2all["user", "rates", "item"].edge_label_index.shape)

print('unique test users', len(test_data_all2all["user", "rates", "item"].edge_label_index[0].unique()))
print('unique test items', len(test_data_all2all["user", "rates", "item"].edge_label_index[1].unique()))



test edges shape BEFORE adding all possible user item pairs torch.Size([2, 2000])
test edges shape AFTER adding all possible user item pairs torch.Size([2, 2108459])
unique test users 1699
unique test items 1241


In [77]:
########## GNN TRAINING ############
#if model_mode == GNN run below
model = GNN_recommender(data, train_data)


train loader: <class 'torch_geometric.loader.link_neighbor_loader.LinkNeighborLoader'>
Device: 'cuda'


100%|██████████| 188/188 [00:19<00:00,  9.83it/s]


Epoch: 001, Loss: 0.4496


100%|██████████| 188/188 [00:18<00:00, 10.44it/s]


Epoch: 002, Loss: 0.3318


100%|██████████| 188/188 [00:18<00:00, 10.38it/s]


Epoch: 003, Loss: 0.2838


100%|██████████| 188/188 [00:17<00:00, 10.56it/s]


Epoch: 004, Loss: 0.2519


100%|██████████| 188/188 [00:19<00:00,  9.61it/s]


Epoch: 005, Loss: 0.2274


100%|██████████| 188/188 [00:19<00:00,  9.70it/s]


Epoch: 006, Loss: 0.2043


100%|██████████| 188/188 [00:19<00:00,  9.46it/s]


Epoch: 007, Loss: 0.1900


100%|██████████| 188/188 [00:19<00:00,  9.51it/s]


Epoch: 008, Loss: 0.1740


100%|██████████| 188/188 [00:19<00:00,  9.54it/s]

Epoch: 009, Loss: 0.1599





In [78]:
######## GNN PRED FOR TEST_DATA_PRIME ######### for CSP chnage the TEST_DATA_PRIME
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_loader = LinkNeighborLoader(
    data=test_data_all2all,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "item"), test_data_all2all["user", "rates", "item"].edge_label_index),
    edge_label=test_data_all2all["user", "rates", "item"].edge_label,
    batch_size=3 * 128,
    shuffle=False,
)
sampled_data = next(iter(test_loader))
preds = []
ground_truths = []
for sampled_data in tqdm(test_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
pred_gnn = torch.cat(preds, dim=0).cpu().numpy()
ground_truth_gnn = torch.cat(ground_truths, dim=0).cpu().numpy()

print('all ground truth len', len(ground_truth_gnn))

100%|██████████| 5491/5491 [08:39<00:00, 10.57it/s]

all ground truth len 2108459





In [183]:
########### DATA PREPRATION FOR MF MODELS #############

# if model_mode == 'MF':
test_df_index = test_data_all2all['user', 'rates', 'item'].edge_label_index.numpy()
test_df_label = test_data_all2all['user', 'rates', 'item'].edge_label.numpy()
test_df_index = test_df_index.T 
test_df = pd.DataFrame(test_df_index, columns=['user', 'item'])
test_df['rating'] = test_df_label

train_df_index = train_data['user', 'rates', 'item'].edge_label_index.numpy()
train_df_label = train_data['user', 'rates', 'item'].edge_label.numpy()
train_df_index = train_df_index.T 
train_df = pd.DataFrame(train_df_index, columns=['user', 'item'])
train_df['rating'] = train_df_label

def add_topic(df, contract_to_topic_df, unique_item_id):
    item_to_topic = pd.Series(contract_to_topic_df['most_probable_topic'].values, index=contract_to_topic_df['contract_name']).to_dict()
    mappedID_to_itemId = pd.Series(unique_item_id['itemId'].values, index=unique_item_id['mappedID']).to_dict()
    df['item_name'] = df['item'].map(mappedID_to_itemId)
    df['topic'] = df['item_name'].map(item_to_topic).fillna(0).astype(int)
    df = df.drop(columns=['item_name'])
    
    return df

contract_to_topic_df = pd.read_parquet("dataset/contract_name_topic.parquet")
test_df = add_topic(test_df, contract_to_topic_df, unique_item_id)
train_df = add_topic(train_df, contract_to_topic_df, unique_item_id)

In [None]:
######## NAME LEVEL MF TRAIN & PRED #########
from lightfm import LightFM
from lightfm.data import Dataset

dataset = Dataset()
# dataset.fit(data['user'].node_id.numpy(), data['item'].node_id.numpy())
user_ids = np.union1d(train_df['user'].unique(), test_df['user'].unique())
item_ids = np.union1d(train_df['item'].unique(), test_df['item'].unique())
dataset.fit(user_ids, item_ids)
user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

(train_interactions, train_interactions_weight) = dataset.build_interactions((row['user'], row['item'], row['rating']) for index, row in train_df.iterrows())

model = LightFM(loss='warp')
model.fit(train_interactions, epochs=30, num_threads=2, sample_weight=train_interactions_weight)

test_df['pred_nmf'] = 0

for user, user_data in tqdm(test_df.groupby('user'), total=test_df['user'].nunique()):
    user_id_internal = user_ids_mapping[user]
    item_ids_internal = np.array([item_ids_mapping[item] for item in user_data['item']])
    predictions = model.predict(user_id_internal, item_ids_internal)
    test_df.loc[user_data.index, 'pred_nmf'] = predictions

pred_nmf = test_df['pred_nmf'].to_numpy()
ground_truth_nmf = test_df['rating'].to_numpy()

In [None]:
######## CONTRACT LEVEL MF TRAIN & PRED #########
from lightfm import LightFM
from lightfm.data import Dataset
dataset = Dataset()
user_ids = np.union1d(train_df['user'].unique(), test_df['user'].unique())
item_ids = np.union1d(train_df['topic'].unique(), test_df['topic'].unique())
dataset.fit(user_ids, item_ids)
user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

(train_interactions, train_interactions_weight) = dataset.build_interactions((row['user'], row['topic'], row['rating']) for index, row in train_df.iterrows())

model = LightFM(loss='warp')
model.fit(train_interactions, epochs=30, num_threads=2, sample_weight=train_interactions_weight)

def topic_popular_contracts(df):
    item_rating_sum = df.groupby(['topic', 'item'])['rating'].sum().reset_index()
    sorted_items = item_rating_sum.sort_values(['topic', 'rating'], ascending=[True, False])
    topic_to_popular_items = {k: g['item'].tolist() for k, g in sorted_items.groupby('topic')}
    return topic_to_popular_items

test_df['pred_cmf'] = 0
topic_popular_contracts_dict = topic_popular_contracts(test_df)

for user, user_data in tqdm(test_df.groupby('user'), total=test_df['user'].nunique()):
    user_id_internal = user_ids_mapping[user]
    item_ids_internal = np.array([item_ids_mapping[item] for item in user_data['topic']])
    predictions = model.predict(user_id_internal, item_ids_internal)
    test_df.loc[user_data.index, 'pred_cmf'] = predictions

pred_cmf = test_df['pred_cmf'].to_numpy()
ground_truth_cmf = test_df['rating'].to_numpy()

In [79]:
####### METRIC EVAL #######

def precision_at_k(user_id, sorted_indices, ground_truth, k):
    top_k_indices = sorted_indices[:k]
    top_k_labels = ground_truth[top_k_indices]
    num_ones = np.sum(ground_truth == 1)
    hit = np.sum(top_k_labels > 0)
    return hit / min(num_ones, k) # k

def average_hit_at_k(k, ground_truth, pred, user_ids, edge_index):
    precisions = []
    for user_id in tqdm(user_ids, total=len(user_ids)):
        mask = edge_index[0] == user_id
        filtered_pred = pred[mask]
        filtered_ground_truth = ground_truth[mask]
        sorted_indices = np.argsort(filtered_pred)[::-1]
        
        precisions.append(
            precision_at_k(user_id, sorted_indices, filtered_ground_truth, k)
        )
        
    return np.mean(precisions)

def dcg_at_k(r, k):
    """
    Compute DCG@k for a list of relevance scores
    
    Parameters:
    - r: Relevance scores in rank order
    - k: Rank
    
    Returns:
    - DCG@k
    """
    r = np.asfarray(r)[:k]
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))

def ndcg_at_k(r, k):
    """
    Compute NDCG@k for a list of relevance scores
    
    Parameters:
    - r: Relevance scores in rank order
    - k: Rank
    
    Returns:
    - NDCG@k
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

def calculate_ndcg_at_k(k, ground_truth, pred, edge_index):
    """
    Calculate the average NDCG@k for all users
    
    Parameters:
    - k: Rank
    - ground_truth: True relevance scores
    - pred: Predicted relevance scores
    - edge_index: User-item interaction indices
    
    Returns:
    - Average NDCG@k
    """
    user_ids = np.unique(edge_index[0].numpy())
    ndcgs = []
    for user_id in tqdm(user_ids, total=len(user_ids)):
        mask = edge_index[0] == user_id
        filtered_pred = pred[mask]
        filtered_ground_truth = ground_truth[mask]
        
        # Sort by predicted score
        sorted_indices = np.argsort(filtered_pred)[::-1]
        sorted_ground_truth = filtered_ground_truth[sorted_indices]
        
        ndcgs.append(ndcg_at_k(sorted_ground_truth, k))
        
    return np.mean(ndcgs)

def average_precision_at_k(user_id, sorted_indices, ground_truth, k):
    """
    Computes the average precision at k for a single user.
    
    Args:
    user_id: The user id.
    sorted_indices: Indices that would sort the predicted ratings.
    ground_truth: Actual ratings (binary) indicating whether an item is relevant or not.
    k: The number of recommendations to consider.
    
    Returns:
    The average precision at k for the given user.
    """
    top_k_indices = sorted_indices[:k]
    top_k_labels = ground_truth[top_k_indices]
    
    relevant_indices = np.where(top_k_labels > 0)[0]
    num_relevant = len(relevant_indices)
    
    if num_relevant == 0:
        return 0
    
    score = 0.0
    for i in relevant_indices:
        prec_at_i = np.sum(top_k_labels[:i+1]) / (i + 1)
        score += prec_at_i
    
    return score / min(num_relevant, k)

def mean_ap_at_k(k, ground_truth, pred, user_ids, edge_index):
    """
    Computes the mean average precision at k.
    
    Args:
    k: The number of recommendations to consider.
    ground_truth: Actual ratings (binary) indicating whether an item is relevant or not.
    pred: Predicted ratings.
    
    Returns:
    The mean average precision at k over all users.
    """
    
    average_precisions = []
    for user_id in tqdm(user_ids, total=len(user_ids)):
        mask = edge_index[0] == user_id
        filtered_pred = pred[mask]
        filtered_ground_truth = ground_truth[mask]
        sorted_indices = np.argsort(filtered_pred)[::-1]
        
        average_precisions.append(
            average_precision_at_k(user_id, sorted_indices, filtered_ground_truth, k)
        )
        
    return np.mean(average_precisions)

def evaluate(k_values, test_data_all2all, ground_truth, pred):
    edge_index = test_data_all2all['user', 'rates', 'item'].edge_label_index
    user_ids = np.unique(edge_index[0].numpy())

    for k in k_values:
        ### HIT@K ###
        hit_at_k = average_hit_at_k(k, ground_truth, pred, user_ids, edge_index)
        print(f"HIT@{k}: {hit_at_k}")
    for k in k_values:
        ### NDCG@K ###
        ndcg_result = calculate_ndcg_at_k(k, ground_truth, pred, edge_index)
        print(f"NDCG@{k}: {ndcg_result}")
    for k in k_values:
        map_at_k = mean_ap_at_k(k, ground_truth, pred, user_ids, edge_index)
        print(f"MAP@{k}: {map_at_k}")


eval_loader = {
    'gnn': {
        'ground_truth': ground_truth_gnn,
        'pred': pred_gnn
    },
    # 'nmf': {
    #     'ground_truth': ground_truth_nmf,
    #     'pred': pred_nmf
    # },
    # 'cmf': {
    #     'ground_truth': ground_truth_cmf,
    #     'pred': pred_cmf
    # },

}
k_values = [1, 5]
evaluate(k_values, test_data_all2all, ground_truth=eval_loader[model_variant_eval]['ground_truth'], pred=eval_loader[model_variant_eval]['pred'])
''' without social edges
HIT@1: 0.03825779870512066
HIT@5: 0.1256131057484795
NDCG@1: 0.03825779870512066
NDCG@5: 0.08228435972107628
MAP@1: 0.03825779870512066
MAP@5: 0.07119874435942712
'''
''' with social edges

'''

100%|██████████| 1699/1699 [00:06<00:00, 280.68it/s]


HIT@1: 0.03825779870512066


100%|██████████| 1699/1699 [00:06<00:00, 268.96it/s]


HIT@5: 0.1256131057484795


100%|██████████| 1699/1699 [00:07<00:00, 239.38it/s]


NDCG@1: 0.03825779870512066


100%|██████████| 1699/1699 [00:07<00:00, 241.44it/s]


NDCG@5: 0.08228435972107628


100%|██████████| 1699/1699 [00:06<00:00, 277.54it/s]


MAP@1: 0.03825779870512066


100%|██████████| 1699/1699 [00:06<00:00, 276.19it/s]

MAP@5: 0.07119874435942712





' with social edges\n\n'

In [None]:
############# DIVERSITY EXPERIMENT ##############
if experiment == 'diversity':
    edge_index = test_data_all2all['user', 'rates', 'item'].edge_label_index
    user_ids = np.unique(edge_index[0].numpy())
    pred = eval_loader[model_variant_eval]['pred']
    ground_truth = eval_loader[model_variant_eval]['ground_truth']

    for k in k_values:
        recs_list = set()
        for user_id in tqdm(user_ids, total=len(user_ids)):
            mask = edge_index[0] == user_id
            filtered_pred = pred[mask]
            filtered_items = edge_index[1][mask]
            sorted_indices = np.argsort(filtered_pred)[::-1]
            top_k_indices = sorted_indices[:k]
            top_k_indices = top_k_indices.copy()
            top_k_items = filtered_items[top_k_indices].numpy()
            recs_list.update(top_k_items)

        diversity_at_k = len(recs_list) / len(np.unique(edge_index[1].numpy()))
        print(f'Item coverage diversity for {model_variant_eval} @{k}:', diversity_at_k)
    
    for k in k_values:
        users_with_relevant_recs = set()
        
        for user_id in tqdm(user_ids, total=len(user_ids)):
            mask = edge_index[0] == user_id
            filtered_pred = pred[mask]
            sorted_indices = np.argsort(filtered_pred)[::-1]
            top_k_indices = sorted_indices[:k]
            filtered_ground_truth = ground_truth[mask] 
            relevant_recs = filtered_ground_truth[top_k_indices] 
            
            if np.sum(relevant_recs) > 0:  # At least one relevant recommendation
                users_with_relevant_recs.add(user_id)
        
        user_coverage_at_k = len(users_with_relevant_recs) / len(user_ids)
        print(f'User coverage for {model_variant_eval} @{k}:', user_coverage_at_k)

    #######  Intra-List Diversity #######
    # TODO: Based on item_feat define the compute_dissimilarity method
    # for k in k_values:
    #     avg_dissimilarity = []
        
    #     for user_id in tqdm(user_ids, total=len(user_ids)):
    #         mask = edge_index[0] == user_id
    #         filtered_pred = pred[mask]
    #         filtered_items = edge_index[1][mask]
    #         sorted_indices = np.argsort(filtered_pred)[::-1]
    #         top_k_indices = sorted_indices[:k]
    #         top_k_items = filtered_items[top_k_indices].numpy()
            
    #         dissimilarity_sum = 0
    #         for i in range(len(top_k_items)):
    #             for j in range(i+1, len(top_k_items)):
    #                 dissimilarity_sum += compute_dissimilarity(top_k_items[i], top_k_items[j])
            
    #         if k > 1:
    #             avg_pairwise_dissimilarity = 2 * dissimilarity_sum / (k * (k - 1))
    #             avg_dissimilarity.append(avg_pairwise_dissimilarity)
        
    #     intra_list_diversity_at_k = np.mean(avg_dissimilarity)
    #     print(f'Intra-list diversity for {model_mode_eval} @{k}:', intra_list_diversity_at_k)



In [None]:
######### HIT@K EVAL V1 ##########
# in val_data len(edge_index) = 80670, but len(edge_label_index) = 30249, we selected edge_label_index since for train_loader used the same
def precision_at_k(user_id, edge_index, ground_truth, pred, k):

    mask = edge_index[0] == user_id
    filtered_pred = pred[mask]
    filtered_ground_truth = ground_truth[mask]
    sorted_indices = filtered_pred.argsort()[:: -1]

    top_k = [(filtered_ground_truth[i], filtered_pred[i]) for i in sorted_indices[:k]]
    hit = 0
    for i in range(len(top_k)):
        ground_truth, pred = top_k[i]
        if ground_truth > 0 and pred > 0: # I think we should remove this: and pred > 0:
            hit += 1
    precision = hit / k

    return precision


def ap_at_k(k, precision_at_k, mode):
    precisions = []
    edge_index = val_loader.data['user', 'rates', 'item'].edge_label_index
    for user_id in tqdm(edge_index[0], total=len(edge_index[0])):
        if mode == 'nmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_nmf, k)) # ground_truth is the same for both GNN and mf
        if mode == 'cmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_cmf, k))
        else:
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred, k))
            break

    return np.mean(precisions)


k_values = [1, 2, 3, 4, 5]
for k in k_values:
    hit_at_k = ap_at_k(k, precision_at_k, mode='GNN')
    print(f"AP@{k}:", hit_at_k)

