In [None]:
pip uninstall torch -y

In [None]:
pip install torch==1.13.1

In [None]:
!pip uninstall torch-scatter -y
!pip uninstall torch-sparse -y
!pip uninstall pyg-lib -y
!pip uninstall git+https://github.com/pyg-team/pytorch_geometric.git -y

In [None]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [2]:
import numpy as np
import pandas as pd
import random
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm import tqdm
from collections import defaultdict
import json
import multiprocessing
import matplotlib.pyplot as plt

from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [30]:
#### DATA LOADER ####
from torch_geometric.data import download_url, extract_zip
from torch import Tensor

def data_loader(ratings_df):
    # Create a mapping from unique user indices to range [0, num_user_nodes):
    unique_user_id = ratings_df['userId'].unique()
    unique_user_id = pd.DataFrame(data={
        'userId': unique_user_id,
        'mappedID': pd.RangeIndex(len(unique_user_id)),
    })
    print("Mapping of user IDs to consecutive values:")
    print("==========================================")
    print(unique_user_id.head())
    print()
    # Create a mapping from unique movie indices to range [0, num_movie_nodes):
    unique_item_id = ratings_df['itemId'].unique()
    unique_item_id = pd.DataFrame(data={
        'itemId': unique_item_id,
        'mappedID': pd.RangeIndex(len(unique_item_id)),
    })
    print("Mapping of item IDs to consecutive values:")
    print("===========================================")
    print(unique_item_id.head())

    ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                                left_on='userId', right_on='userId', how='left')
    ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
    ratings_item_id = pd.merge(ratings_df['itemId'], unique_item_id,
                                left_on='itemId', right_on='itemId', how='left')
    ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].values)
    # With this, we are ready to construct our `edge_index` in COO format
    # following PyG semantics:
    edge_index_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)
    # assert edge_index_user_to_item.size() == (2, 100836)
    print()
    print("Final edge indices pointing from users to items:")
    print("=================================================")
    print(edge_index_user_to_item)
    return unique_user_id, unique_item_id, edge_index_user_to_item

# ##### MOVIELENS DATA ########
# url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# extract_zip(download_url(url, '.'), '.')
# movies_path = './ml-latest-small/movies.csv'
# ratings_path = './ml-latest-small/ratings.csv'
# items_ratings_df = pd.read_csv(ratings_path)
# items_ratings_df = items_ratings_df.rename(columns={'movieId': 'itemId'})
# unique_user_id, unique_item_id, edge_index_user_to_movie = data_loader(items_ratings_df)
# items_df = pd.read_csv(movies_path)
# items_df = items_df.rename(columns={'movieId': 'itemId', 'title': 'name'})
# items_df = pd.merge(items_df, unique_item_id, on='itemId', how='left')
# items_df = items_df.sort_values('mappedID') # (Just the last 20 movies have NaN mappedId)
# genres = items_df['genres'].str.get_dummies('|')
# print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
# movie_feat = torch.from_numpy(genres.values).to(torch.float)
# # assert movie_feat.size() == (9742, 20)  # 20 genres in total.

###### CONTRACTS DATA ########
items_ratings_df = pd.read_parquet('dataset/user_contract_rating.parquet')
items_ratings_df = items_ratings_df[:10000]
items_df = {}
items_df['name'] = items_ratings_df['item'].unique()
items_df['itemId'], unique_names = pd.factorize(items_df['name'])
items_df['itemId'] = items_df['itemId'] + 1
items_df = pd.DataFrame(items_df, columns=['itemId', 'name'])

contract_top_words_df = pd.read_parquet('dataset/contract_top_words.parquet')
contract_top_words_df = contract_top_words_df.rename(columns={'contract_name': 'name'})
contracts_df_top_words = items_df.merge(contract_top_words_df, on='name', how='left')
contracts_df_top_words['keywords'] = contracts_df_top_words['keywords'].fillna('')
items_df = contracts_df_top_words
items_df.set_index('itemId', inplace=True)
# f =5
items_df['truncated_keywords'] = items_df['keywords'].apply(lambda x: ','.join(x.split(',')))
X_df = items_df['truncated_keywords'].str.get_dummies(',')
contract_feat = torch.from_numpy(X_df.values).to(torch.float)
print(contract_feat.shape)
print(contract_feat)
print(items_df)
print(X_df)
items_ratings_df = items_ratings_df.rename(columns={'user': 'userId', 'item': 'itemId'})
unique_user_id, unique_item_id, edge_index_user_to_contract = data_loader(items_ratings_df)

torch.Size([4343, 4338])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.]])
                               name   
itemId                                
1                       TetherToken  \
2       TransparentUpgradeableProxy   
3                       BridgeToken   
4                             Token   
5                           Seaport   
...                             ...   
4339                   GenesisVault   
4340                        LetsApe   
4341                        Isoroom   
4342                     AstroGator   
4343                  ZooFrenzToken   

                                                 keywords   
itemId                                                      
1       dev, erc20, upgraded, address, oncode, title, ...  \
2       proxy, admin, proxyadm

In [37]:
########## PYG BINARY LINK PREDICTION ############

def GNN_recommender(unique_user_id, items_df, item_feat, edge_index_user_to_item):
    data = HeteroData()
    data["user"].node_id = torch.arange(len(unique_user_id))
    data["item"].node_id = torch.arange(len(items_df))
    data["item"].x = item_feat
    data["user", "rates", "item"].edge_index = edge_index_user_to_item
    data = T.ToUndirected()(data)

    # For this, we first split the set of edges into
    # training (80%), validation (10%), and testing edges (10%).
    # Across the training edges, we use 70% of edges for message passing,
    # and 30% of edges for supervision.
    # We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
    # Negative edges during training will be generated on-the-fly.
    # We can leverage the `RandomLinkSplit()` transform for this from PyG:
    transform = T.RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        disjoint_train_ratio=0.3,
        neg_sampling_ratio=2.0,
        add_negative_train_samples=False,
        edge_types=("user", "rates", "item"),
        rev_edge_types=("item", "rev_rates", "user"), 
    )
    train_data, val_data, test_data = transform(data)

    # In the first hop, we sample at most 20 neighbors.
    # In the second hop, we sample at most 10 neighbors.
    # In addition, during training, we want to sample negative edges on-the-fly with
    # a ratio of 2:1.
    # We can make use of the `loader.LinkNeighborLoader` from PyG:

    # Define seed edges:
    edge_label_index = train_data["user", "rates", "item"].edge_label_index
    edge_label = train_data["user", "rates", "item"].edge_label
    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=[20, 10],
        neg_sampling_ratio=2.0,
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=128,
        shuffle=True,
    )

    class GNN(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            self.conv1 = SAGEConv(hidden_channels, hidden_channels)
            self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
            x = F.relu(self.conv1(x, edge_index))
            x = self.conv2(x, edge_index)
            return x
    # Our final classifier applies the dot-product between source and destination
    # node embeddings to derive edge-level predictions:
    class Classifier(torch.nn.Module):
        def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
            edge_feat_user = x_user[edge_label_index[0]] # Convert node embeddings to edge-level representations:
            edge_feat_item = x_item[edge_label_index[1]]
            scores = (edge_feat_user * edge_feat_item).sum(dim=-1)
            return scores # Apply dot-product to get a prediction per supervision edge:
        
    class Model(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            # Since the dataset does not come with rich features, we also learn two
            # embedding matrices for users and items:
            self.item_lin = torch.nn.Linear(contract_feat.shape[1], hidden_channels) #put contract_feat.shape[1] for contracts
            self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(data["item"].num_nodes, hidden_channels)
            # Instantiate homogeneous GNN:
            self.gnn = GNN(hidden_channels)
            # Convert GNN model into a heterogeneous variant:
            self.gnn = to_hetero(self.gnn, metadata=data.metadata())
            self.classifier = Classifier()

        def forward(self, data: HeteroData) -> Tensor:
            x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "item": self.item_lin(data["item"].x) + self.item_emb(data["item"].node_id),
            } 
            # `x_dict` holds feature matrices of all node types
            # `edge_index_dict` holds all edge indices of all edge types
            x_dict = self.gnn(x_dict, data.edge_index_dict)
            pred = self.classifier(
                x_dict["user"],
                x_dict["item"],
                data["user", "rates", "item"].edge_label_index,
            )
            return pred
            
    ########## TRAINING ##########
    model = Model(hidden_channels=64)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: '{device}'")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(1, 10):
        total_loss = total_examples = 0
        for sampled_data in tqdm(train_loader):
            optimizer.zero_grad()
            sampled_data.to(device)
            pred = model(sampled_data)
            ground_truth = sampled_data["user", "rates", "item"].edge_label
            loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
            loss.backward()
            optimizer.step()
            total_loss += float(loss) * pred.numel()
            total_examples += pred.numel()

        # TODO: Add the val_loader, keep the best model
        print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

    ########## EVAL VALIDATION #########
    edge_label_index = val_data["user", "rates", "item"].edge_label_index
    edge_label = val_data["user", "rates", "item"].edge_label
    # val_data has neg samples in it
    val_loader = LinkNeighborLoader(
        data=val_data,
        num_neighbors=[20, 10],
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=3 * 128,
        shuffle=False,
    )
    sampled_data = next(iter(val_loader))
    preds = []
    ground_truths = []
    for sampled_data in tqdm(val_loader):
        with torch.no_grad():
            sampled_data.to(device)
            preds.append(model(sampled_data))
            ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    auc = roc_auc_score(ground_truth, pred)
    print()
    print(f"Validation AUC: {auc:.4f}")
    return data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data, model

In [None]:
########## SIMPLE PYG BINARY LINK PREDICTION ############
from torch.utils.data import random_split
from torch.utils.data import DataLoader

# INPROGRESS
#TODO: split the edge_index, then for each set create the x and fed along edge_index{set_nme}
x_user = torch.randn(len(unique_user_id), 20)  # node features
x_item = movie_feat
print(movie_feat)
x = torch.cat((x_user, x_item), dim=0)

data = Data(x=x, edge_index=edge_index_user_to_movie)
print(data)
train_length = int(0.8 * len(data))
test_length = len(data) - train_length

train_data, test_data = random_split(data, [train_length, test_length])
print(train_data)


unique_users = set()
unique_items = set()

for data in train_loader:
    user, item = data.edge_index
    unique_users.update(user.tolist())
    unique_items.update(item.tolist())

train_target = torch.zeros(len(unique_users), len(unique_items), dtype=torch.float32)

for data in train_loader:
    user, item = data.edge_index  # Assuming each data point is a tuple of (user, item)
    train_target[user, item] = 1.0


# class GNNRec(torch.nn.Module):
#     def __init__(self, in_channels, out_channels):
#         super(GNNRec, self).__init__()
#         self.conv1 = GCNConv(in_channels, 64)
#         self.conv2 = GCNConv(64, out_channels)
    
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = F.relu(self.conv1(x, edge_index))
#         x = self.conv2(x, edge_index)
#         return x

# model = GNNRec(16, 32)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# # Function to compute the prediction scores using dot product
# def compute_scores(user_embeddings, movie_embeddings):
#     scores = torch.matmul(user_embeddings, movie_embeddings.t())
#     return torch.sigmoid(scores)

# # Sample training loop
# model.train()
# for epoch in range(10):
#     optimizer.zero_grad()
#     out = model(data)
#     user_embeddings, movie_embeddings = out[:len(unique_user_id)], out[len(unique_user_id):] # this should be len(users) and len(movies)
#     scores = compute_scores(user_embeddings, movie_embeddings)
#     # Sample loss: You would ideally want pairs with interactions to have scores close to 1 and others close to 0
#     target = torch.tensor([[1, 0, 0], 
#                            [0, 1, 1],
#                            [0, 0, 1]], dtype=torch.float) # This is a placeholder. Replace with your actual targets.
#     loss = F.binary_cross_entropy(scores, target)
#     loss.backward()
#     optimizer.step()

# # Predictions
# model.eval()
# with torch.no_grad():
#     out = model(data)
#     user_embeddings, movie_embeddings = out[:3], out[3:]
#     predictions = compute_scores(user_embeddings, movie_embeddings)
    
# print(predictions)

In [38]:
########## EXPERIMENTS ############
###### MOVIELENS MODEL ######
# data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data = GNN_recommender(unique_user_id, items_df, movie_feat, edge_index_user_to_movie)

# ablation_without_movie_feature = torch.zeros_like(movie_feat)
# data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data = GNN_recommender(unique_user_id, items_df, ablation_without_movie_feature, edge_index_user_to_movie)

###### CONTRACTS MODEL ######
data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data, model = GNN_recommender(unique_user_id, items_df, contract_feat, edge_index_user_to_contract)

# Ablation study of GNN
# ablation_without_contract_feature = torch.zeros_like(contract_feat)
# data, train_data, val_data, train_loader, val_loader, ground_truth, pred, test_data = GNN_recommender(unique_user_id, items_df, ablation_without_contract_feature, edge_index_user_to_contract)



# Cold Start User: Just need to change the test_df

# Cold Start Item

# Diversity

# Contract Representation (effect of f top-keywords in contract_feat)

# Sparsity (keep users just with > h interactions)

Device: 'cuda'


  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 19.78it/s]


Epoch: 001, Loss: 0.6357


100%|██████████| 19/19 [00:01<00:00, 18.73it/s]


Epoch: 002, Loss: 0.5014


100%|██████████| 19/19 [00:00<00:00, 19.49it/s]


Epoch: 003, Loss: 0.4289


100%|██████████| 19/19 [00:00<00:00, 19.49it/s]


Epoch: 004, Loss: 0.3758


100%|██████████| 19/19 [00:00<00:00, 20.18it/s]


Epoch: 005, Loss: 0.3340


100%|██████████| 19/19 [00:00<00:00, 19.05it/s]


Epoch: 006, Loss: 0.3084


100%|██████████| 19/19 [00:00<00:00, 20.09it/s]


Epoch: 007, Loss: 0.2837


100%|██████████| 19/19 [00:00<00:00, 20.50it/s]


Epoch: 008, Loss: 0.2595


100%|██████████| 19/19 [00:00<00:00, 20.68it/s]


Epoch: 009, Loss: 0.2413


100%|██████████| 8/8 [00:00<00:00, 20.27it/s]


Validation AUC: 0.8713





In [52]:
# How to pick a slice from val_data? This will yield the preds in less time since we will have less unique_user and items
# print(val_data)
print(train_data)
print(data)


HeteroData(
  user={ node_id=[1140] },
  item={
    node_id=[4343],
    x=[4343, 4338],
  },
  (user, rates, item)={
    edge_index=[2, 5600],
    edge_label=[2400],
    edge_label_index=[2, 2400],
  },
  (item, rev_rates, user)={ edge_index=[2, 5600] }
)
HeteroData(
  user={ node_id=[1140] },
  item={
    node_id=[4343],
    x=[4343, 4338],
  },
  (user, rates, item)={ edge_index=[2, 10000] },
  (item, rev_rates, user)={ edge_index=[2, 10000] }
)


In [40]:
######## GNN PRED FOR ALL USER ITEM PAIRS IN VAL_LOADER #########
# BUG: now we are adding the (user,item) pair for all users and items in data, but it should be just for test_data, maybe we can regenerate the test_data with neg_sample=0
edge_index_val = set(zip(val_data["user", "rates", "item"].edge_label_index[0].numpy(), val_data["user", "rates", "item"].edge_label_index[1].numpy()))

all_users = val_data["user", "rates", "item"].edge_label_index[0].unique().numpy()
all_items = val_data["user", "rates", "item"].edge_label_index[1].unique().numpy()

new_edges = []
new_labels = []

for user_id in all_users:
    for item_id in all_items:
        if (user_id, item_id) not in edge_index_val:
            new_edges.append((user_id, item_id))
            new_labels.append(0)

# Convert lists to tensors and concatenate them to the original tensors
if new_edges:
    new_edges_tensor = torch.tensor(new_edges, dtype=torch.int64).t().contiguous()
    new_labels_tensor = torch.tensor(new_labels, dtype=torch.int64)

    val_data["user", "rates", "item"].edge_label_index = torch.cat((val_data["user", "rates", "item"].edge_label_index, new_edges_tensor), dim=1)
    val_data["user", "rates", "item"].edge_label = torch.cat((val_data["user", "rates", "item"].edge_label, new_labels_tensor), dim=0)

#TODO: The way that we calculating the hit@k dosen't make sense, what about given the positive edges in val_set, group by user, get the prediction list for user and all items
# sort the preds, now we cna have hit@k and NDCG and very other metrics. If it takes too long, simply get a slice of val_data

# val_data_pos = val_loader_df[val_loader_df['rating'] == 1]
# data = HeteroData()
# data["user"].node_id = torch.from_numpy(val_data_pos['user'].unique()).to(torch.int64)
# data["item"].node_id = torch.from_numpy(val_data_pos['item'].unique()).to(torch.int64)
# #TODO Is this contract_feat correct, or we should fed all the contratcs_feat instead of selecting festures just for unqiue contracts in this small set?
# data["item"].x = contract_feat[torch.from_numpy(val_data_pos['item'].unique()).to(torch.long)] 
# # data["item"].x = contract_feat
# # for each user in unique_users, get the top-k item predicted by model
# ratings_user_id = torch.empty(0, dtype=torch.int64)
# ratings_item_id = torch.empty(0, dtype=torch.int64)
# for user in val_data_pos['user'].unique():
#     ratings_user_id = torch.cat((ratings_user_id, torch.full((len(val_data_pos['item'].unique()), ), user, dtype=torch.int64)), dim=0)
#     ratings_item_id = torch.cat((ratings_item_id, data["item"].node_id), dim=0)

# data["user", "rates", "item"].edge_index = torch.stack([ratings_user_id, ratings_item_id], dim=0)
# temp_ground_truth = []
# for user, item in zip(data["user", "rates", "item"].edge_index.numpy()[0], data["user", "rates", "item"].edge_index.numpy()[1]):
#     if val_data_pos[(val_data_pos['user'] == user) & (val_data_pos['item'] == item)].shape[0] > 0:
#         temp_ground_truth.append(1)
#     else:
#         temp_ground_truth.append(0)
# data["user", "rates", "item"].edge_label = torch.tensor(temp_ground_truth).to(torch.int64)
# data = T.ToUndirected()(data)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_loader_temp = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "item"), val_data["user", "rates", "item"].edge_label_index),
    edge_label=val_data["user", "rates", "item"].edge_label,
    batch_size=3 * 128,
    shuffle=False,
)
sampled_data = next(iter(val_loader_temp))
preds = []
ground_truths = []
for sampled_data in tqdm(val_loader_temp):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()

print(len(ground_truth))
print(len(val_data["user", "rates", "item"].edge_label_index[0].unique()))
print(len(val_data["user", "rates", "item"].edge_label_index[1].unique()))


100%|██████████| 5488/5488 [01:51<00:00, 49.09it/s]


In [None]:
########### DATA PREPRATION FOR MF MODELS #############
print(val_loader.data)
val_loader_df_index = val_loader.data['user', 'rates', 'item'].edge_label_index.numpy()
val_loader_df_label = val_loader.data['user', 'rates', 'item'].edge_label.numpy()
val_loader_df_index = val_loader_df_index.T 
val_loader_df = pd.DataFrame(val_loader_df_index, columns=['user', 'item'])
val_loader_df['rating'] = val_loader_df_label

train_loader_df_index = train_loader.data['user', 'rates', 'item'].edge_label_index.numpy()
train_loader_df_label = train_loader.data['user', 'rates', 'item'].edge_label.numpy()
train_loader_df_index = train_loader_df_index.T 
train_loader_df = pd.DataFrame(train_loader_df_index, columns=['user', 'item'])
train_loader_df['rating'] = train_loader_df_label


contract_to_topic_df = pd.read_parquet("dataset/contract_name_topic.parquet")

def add_topic(df):
    df['topic'] = ''
    for i, item in df.iterrows():
        item_name = unique_item_id[unique_item_id['mappedID'] == item['item']]['itemId']
        topic = contract_to_topic_df[contract_to_topic_df['contract_name'] == item_name.item()]['most_probable_topic']
        
        if topic.shape[0] == 0:  # No matches
            df.at[i, 'topic'] = 0
        elif topic.shape[0] > 1:  # Multiple matches
            df.at[i, 'topic'] = topic.iloc[0].item()  # or handle it differently
        else:  # Exactly one match
            df.at[i, 'topic'] = topic.item()
    
    return df

val_loader_df = add_topic(val_loader_df)
train_loader_df = add_topic(train_loader_df)

In [None]:
######## NAME LEVEL MF TRAIN & PRED #########
from lightfm import LightFM
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(data['user'].node_id.numpy(), data['item'].node_id.numpy())
user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

(train_interactions, train_interactions_weight) = dataset.build_interactions((row['user'], row['item'], row['rating']) for index, row in train_loader_df.iterrows())

model = LightFM(loss='warp')
model.fit(train_interactions, epochs=30, num_threads=2, sample_weight=train_interactions_weight)

val_loader_df['pred_nmf'] = 0
for i, row in val_loader_df.iterrows():
    user_id_internal = user_ids_mapping[row['user']]
    item_id_internal = item_ids_mapping[row['item']]
    val_loader_df['pred_nmf'][i] = model.predict(user_id_internal, [item_id_internal])

pred_nmf = val_loader_df['pred_nmf'].to_numpy()
ground_truth = val_loader_df['rating'].to_numpy()

In [154]:
######## CONTRACT LEVEL MF TRAIN & PRED #########
from lightfm import LightFM
from lightfm.data import Dataset
dataset = Dataset()
dataset.fit(data['user'].node_id.numpy(), np.arange(15)) # since we have 0 to 14 topics
user_ids_mapping, _, item_ids_mapping, _ = dataset.mapping()

(train_interactions, train_interactions_weight) = dataset.build_interactions((row['user'], row['topic'], row['rating']) for index, row in train_loader_df.iterrows())

model = LightFM(loss='warp')
model.fit(train_interactions, epochs=30, num_threads=2, sample_weight=train_interactions_weight)

def topic_popular_contracts(df):
    item_rating_sum = df.groupby(['topic', 'item'])['rating'].sum().reset_index()
    sorted_items = item_rating_sum.sort_values(['topic', 'rating'], ascending=[True, False])
    topic_to_popular_items = {k: g['item'].tolist() for k, g in sorted_items.groupby('topic')}
    return topic_to_popular_items

val_loader_df['pred_cmf'] = 0
topic_popular_contracts_dict = topic_popular_contracts(val_loader_df)
for i, row in val_loader_df.iterrows():
    user_id_internal = user_ids_mapping[row['user']]
    # now topic_id is a prediction, we need to get the pred for all 14 topics, then sort it and return the topic (indices) with highest value
    topic_pred = model.predict(user_id_internal, np.arange(15))
    topic_id = topic_pred.argsort()[::-1][0] # we can get f predicted topic instead of the highest one
    val_loader_df['pred_cmf'][i] = topic_popular_contracts_dict[topic_id] # we are getting all popular contracts in predicted topic


pred_cmf = val_loader_df['pred_cmf'].to_numpy()

[13, 35, 50, 53, 68, 80, 20, 28, 29, 36, 38, 41, 65, 95]


In [None]:
######### HIT@K EVAL V2 ##########
# in val_data len(edge_index) = 80670, but len(edge_label_index) = 30249, we selected edge_label_index since for train_loader used the same
def precision_at_k(user_id, edge_index, ground_truth, pred, k):

    mask = edge_index[0] == user_id
    filtered_pred = pred[mask]
    filtered_ground_truth = ground_truth[mask]
    print(filtered_pred)
    print(len(filtered_pred))
    print(filtered_ground_truth)
    print(len(filtered_ground_truth))
    sorted_indices = filtered_pred.argsort()[:: -1]

    top_k = [(filtered_ground_truth[i], filtered_pred[i]) for i in sorted_indices[:k]]
    hit = 0
    for i in range(len(top_k)):
        ground_truth, pred = top_k[i]
        if ground_truth > 0 and pred > 0: # I think we should remove this: and pred > 0:
            hit += 1
    precision = hit / k

    return precision


def ap_at_k(k, precision_at_k, mode):
    precisions = []
    edge_index = val_loader.data['user', 'rates', 'item'].edge_label_index
    for user_id in tqdm(edge_index[0], total=len(edge_index[0])):
        if mode == 'nmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_nmf, k)) # ground_truth is the same for both GNN and mf
        if mode == 'cmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_cmf, k))
        else:
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred, k))
            break

    return np.mean(precisions)


k_values = [1, 2, 3, 4, 5]
for k in k_values:
    hit_at_k = ap_at_k(k, precision_at_k, mode='GNN')
    print(f"AP@{k}:", hit_at_k)



In [None]:
######### HIT@K EVAL V1 ##########
# in val_data len(edge_index) = 80670, but len(edge_label_index) = 30249, we selected edge_label_index since for train_loader used the same
def precision_at_k(user_id, edge_index, ground_truth, pred, k):

    mask = edge_index[0] == user_id
    filtered_pred = pred[mask]
    filtered_ground_truth = ground_truth[mask]
    sorted_indices = filtered_pred.argsort()[:: -1]

    top_k = [(filtered_ground_truth[i], filtered_pred[i]) for i in sorted_indices[:k]]
    hit = 0
    for i in range(len(top_k)):
        ground_truth, pred = top_k[i]
        if ground_truth > 0 and pred > 0: # I think we should remove this: and pred > 0:
            hit += 1
    precision = hit / k

    return precision


def ap_at_k(k, precision_at_k, mode):
    precisions = []
    edge_index = val_loader.data['user', 'rates', 'item'].edge_label_index
    for user_id in tqdm(edge_index[0], total=len(edge_index[0])):
        if mode == 'nmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_nmf, k)) # ground_truth is the same for both GNN and mf
        if mode == 'cmf':
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred_cmf, k))
        else:
            precisions.append(precision_at_k(user_id, edge_index, ground_truth, pred, k))
            break

    return np.mean(precisions)


k_values = [1, 2, 3, 4, 5]
for k in k_values:
    hit_at_k = ap_at_k(k, precision_at_k, mode='GNN')
    print(f"AP@{k}:", hit_at_k)

