In [None]:
pip install torch==1.13.1

In [None]:
!pip uninstall torch-scatter -y
!pip uninstall torch-sparse -y
!pip uninstall pyg-lib -y
!pip uninstall git+https://github.com/pyg-team/pytorch_geometric.git -y

In [None]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [1]:
import numpy as np
import pandas as pd
import random
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm import tqdm
from collections import defaultdict
import json
import multiprocessing
import matplotlib.pyplot as plt

from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
#### DATA LOADER ####
from torch_geometric.data import download_url, extract_zip
from torch import Tensor

def data_loader(ratings_df):
    # Create a mapping from unique user indices to range [0, num_user_nodes):
    unique_user_id = ratings_df['userId'].unique()
    unique_user_id = pd.DataFrame(data={
        'userId': unique_user_id,
        'mappedID': pd.RangeIndex(len(unique_user_id)),
    })
    print("Mapping of user IDs to consecutive values:")
    print("==========================================")
    print(unique_user_id.head())
    print()
    # Create a mapping from unique movie indices to range [0, num_movie_nodes):
    unique_item_id = ratings_df['itemId'].unique()
    unique_item_id = pd.DataFrame(data={
        'itemId': unique_item_id,
        'mappedID': pd.RangeIndex(len(unique_item_id)),
    })
    print("Mapping of movie IDs to consecutive values:")
    print("===========================================")
    print(unique_item_id.head())

    ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                                left_on='userId', right_on='userId', how='left')
    ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
    ratings_item_id = pd.merge(ratings_df['itemId'], unique_item_id,
                                left_on='itemId', right_on='itemId', how='left')
    ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].values)
    # With this, we are ready to construct our `edge_index` in COO format
    # following PyG semantics:
    edge_index_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)
    # assert edge_index_user_to_item.size() == (2, 100836)
    print()
    print("Final edge indices pointing from users to movies:")
    print("=================================================")
    print(edge_index_user_to_item)
    return unique_user_id, edge_index_user_to_item

###### MOVIELENS DATA ########
# url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# extract_zip(download_url(url, '.'), '.')
# movies_path = './ml-latest-small/movies.csv'
# ratings_path = './ml-latest-small/ratings.csv'
# movies_df = pd.read_csv(movies_path, index_col='movieId')
# movies_df = movies_df.rename(columns={'movieId': 'itemId'})
# genres = movies_df['genres'].str.get_dummies('|')
# print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
# movie_feat = torch.from_numpy(genres.values).to(torch.float)
# # assert movie_feat.size() == (9742, 20)  # 20 genres in total.
# movies_ratings_df = pd.read_csv(ratings_path)
# movies_ratings_df = movies_ratings_df.rename(columns={'movieId': 'itemId'})
# unique_user_id, edge_index_user_to_movie = data_loader(movies_ratings_df)

###### CONTRACTS DATA ########
contracts_ratings_df = pd.read_parquet('dataset/user_contract_rating.parquet')
contracts_df = {}
contracts_df['name'] = contracts_ratings_df['item'].unique()
contracts_df['itemId'], unique_names = pd.factorize(contracts_df['name'])
contracts_df['itemId'] = contracts_df['itemId'] + 1
contracts_df = pd.DataFrame(contracts_df, columns=['itemId', 'name'])
contracts_df.set_index('itemId', inplace=True)


# contracts_ratings_df = contracts_ratings_df.rename(columns={'user': 'userId', 'item': 'itemId'})
# unique_user_id, edge_index_user_to_contract = data_loader(contracts_ratings_df)

                               name
itemId                             
1                       TetherToken
2       TransparentUpgradeableProxy
3                       BridgeToken
4                             Token
5                           Seaport
...                             ...
31523                          HDTC
31524                      FreeShop
31525                     FaceKnots
31526                   Brainwashed
31527                     EtherBall

[31527 rows x 1 columns]


In [None]:
# TODO: add the top_k_words columns


In [None]:
########## PYG BINARY LINK PREDICTION ############
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

def GNN_recommender(unique_user_id, items_df, item_feat, edge_index_user_to_item):
    data = HeteroData()
    data["user"].node_id = torch.arange(len(unique_user_id))
    data["item"].node_id = torch.arange(len(items_df))
    data["item"].x = item_feat
    data["user", "rates", "item"].edge_index = edge_index_user_to_item
    data = T.ToUndirected()(data)

    # For this, we first split the set of edges into
    # training (80%), validation (10%), and testing edges (10%).
    # Across the training edges, we use 70% of edges for message passing,
    # and 30% of edges for supervision.
    # We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
    # Negative edges during training will be generated on-the-fly.
    # We can leverage the `RandomLinkSplit()` transform for this from PyG:
    transform = T.RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        disjoint_train_ratio=0.3,
        neg_sampling_ratio=2.0,
        add_negative_train_samples=False,
        edge_types=("user", "rates", "item"),
        rev_edge_types=("item", "rev_rates", "user"), 
    )
    train_data, val_data, test_data = transform(data)

    # In the first hop, we sample at most 20 neighbors.
    # In the second hop, we sample at most 10 neighbors.
    # In addition, during training, we want to sample negative edges on-the-fly with
    # a ratio of 2:1.
    # We can make use of the `loader.LinkNeighborLoader` from PyG:
    from torch_geometric.loader import LinkNeighborLoader

    # Define seed edges:
    edge_label_index = train_data["user", "rates", "item"].edge_label_index
    edge_label = train_data["user", "rates", "item"].edge_label
    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=[20, 10],
        neg_sampling_ratio=2.0,
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=128,
        shuffle=True,
    )

    from torch_geometric.nn import SAGEConv, to_hetero
    import torch.nn.functional as F
    class GNN(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            self.conv1 = SAGEConv(hidden_channels, hidden_channels)
            self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
            x = F.relu(self.conv1(x, edge_index))
            x = self.conv2(x, edge_index)
            return x
    # Our final classifier applies the dot-product between source and destination
    # node embeddings to derive edge-level predictions:
    class Classifier(torch.nn.Module):
        def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
            # Convert node embeddings to edge-level representations:
            edge_feat_user = x_user[edge_label_index[0]]
            edge_feat_item = x_item[edge_label_index[1]]
            # Apply dot-product to get a prediction per supervision edge:
            return (edge_feat_user * edge_feat_item).sum(dim=-1)

    class Model(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            # Since the dataset does not come with rich features, we also learn two
            # embedding matrices for users and items:
            self.item_lin = torch.nn.Linear(20, hidden_channels)
            self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(data["item"].num_nodes, hidden_channels)
            # Instantiate homogeneous GNN:
            self.gnn = GNN(hidden_channels)
            # Convert GNN model into a heterogeneous variant:
            self.gnn = to_hetero(self.gnn, metadata=data.metadata())
            self.classifier = Classifier()
        def forward(self, data: HeteroData) -> Tensor:
            x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "item": self.item_lin(data["item"].x) + self.item_emb(data["item"].node_id),
            } 
            # `x_dict` holds feature matrices of all node types
            # `edge_index_dict` holds all edge indices of all edge types
            x_dict = self.gnn(x_dict, data.edge_index_dict)
            pred = self.classifier(
                x_dict["user"],
                x_dict["item"],
                data["user", "rates", "item"].edge_label_index,
            )
            return pred
            
    model = Model(hidden_channels=64)

    import tqdm
    import torch.nn.functional as F
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: '{device}'")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(1, 6):
        total_loss = total_examples = 0
        for sampled_data in tqdm.tqdm(train_loader):
            optimizer.zero_grad()
            sampled_data.to(device)
            pred = model(sampled_data)
            ground_truth = sampled_data["user", "rates", "item"].edge_label
            loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
            loss.backward()
            optimizer.step()
            total_loss += float(loss) * pred.numel()
            total_examples += pred.numel()
        print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

    # Define the validation seed edges:
    edge_label_index = val_data["user", "rates", "item"].edge_label_index
    edge_label = val_data["user", "rates", "item"].edge_label
    # val_data has neg samples in it
    val_loader = LinkNeighborLoader(
        data=val_data,
        num_neighbors=[20, 10],
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=3 * 128,
        shuffle=False,
    )
    sampled_data = next(iter(val_loader))
    from sklearn.metrics import roc_auc_score
    preds = []
    ground_truths = []
    #TODO: How to get the predictions in pred just for a user, then calculate avg_percision by knowing the ground_truth 
    # (how many of 1s are ranked in top-k)
    for sampled_data in tqdm.tqdm(val_loader):
        with torch.no_grad():
            sampled_data.to(device)
            preds.append(model(sampled_data))
            ground_truths.append(sampled_data["user", "rates", "item"].edge_label)
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    auc = roc_auc_score(ground_truth, pred)
    print()
    print(f"Validation AUC: {auc:.4f}")
    return val_data, ground_truth, pred

In [11]:
print(movies_df)

                                             title   
movieId                                              
1                                 Toy Story (1995)  \
2                                   Jumanji (1995)   
3                          Grumpier Old Men (1995)   
4                         Waiting to Exhale (1995)   
5               Father of the Bride Part II (1995)   
...                                            ...   
193581   Black Butler: Book of the Atlantic (2017)   
193583                No Game No Life: Zero (2017)   
193585                                Flint (2017)   
193587         Bungo Stray Dogs: Dead Apple (2018)   
193609         Andrew Dice Clay: Dice Rules (1991)   

                                              genres  
movieId                                               
1        Adventure|Animation|Children|Comedy|Fantasy  
2                         Adventure|Children|Fantasy  
3                                     Comedy|Romance  
4                     

In [None]:
###### MOVIELENS MODEL ######
val_data, ground_truth, pred = GNN_recommender(unique_user_id, movies_df, movie_feat, edge_index_user_to_movie)

###### CONTRACTS MODEL ######
val_data, ground_truth, pred = GNN_recommender(unique_user_id, contracts_df, contract_feat, edge_index_user_to_contarct)

In [3]:
# in val_data len(edge_index) = 80670, but len(edge_label_index) = 30249, we selected edge_label_index since for train_loader used the same

def calculate_ap_at_k(user_id, ground_truth, pred, k):

    mask = edge_index[0] == user_id
    filtered_edge_index = edge_index[:, mask]
    filtered_pred = pred[mask]
    filtered_ground_truth = ground_truth[mask]
    
    sorted_indices = filtered_pred.argsort()[:: -1]
    top_k = [(filtered_ground_truth[i], filtered_pred[i]) for i in sorted_indices[:k]]
    hit = 0
    for i in range(len(top_k)):
        ground_truth, pred = top_k[i]
        if ground_truth > 0 and pred > 0:
            hit += 1
    precision = hit / k

    return precision

k = 5
precisions = []
edge_index = val_data['user', 'rates', 'item'].edge_label_index
for user_id in edge_index[0]:
    precisions.append(calculate_ap_at_k(user_id, ground_truth, pred, k))

hit_at_k = np.mean(precisions)
print(hit_at_k)



0.6997057753975339


In [None]:
# val_data has around 30k edge label, and 30k edge index tuples (from, to). 10k of them are positive edges and others are neg samples

print(val_data['user', 'rates', 'movie'])

# print(pred[:5])
# print(ground_truth[:5])
# print(len(pred))
# print(len(ground_truth))
# print(val_data['user', 'rates', 'movie'].edge_index)
# count = 0
# for sampled_data in val_loader:
#     count += 1
#     if count % 4 == 0 : 
#         print(sampled_data)
#         # print(sampled_data['movie'])
#         # print(sampled_data["user", "rates", "movie"].edge_label)
#         # print(sampled_data["user", "rates", "movie"].edge_label_index)
#         break
# print(val_data)
# print(genres.values[:5])
# print(len(ratings_user_id))
# print(ratings_df[:5])
# ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
#                             left_on='userId', right_on='userId', how='left')
# print(ratings_user_id[:5])
# ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
# print(ratings_user_id[:5])

# print(train_loader.data)
# print(data["user", "rates", "movie"].edge_index)

# print(len(val_data["user", "rates", "movie"].edge_label))
# print(val_data["user", "rates", "movie"].edge_label_index)
# print(val_data["user", "rates", "movie"].edge_label_index[0][15122:15128])
# val_data["user", "rates", "movie"].edge_label_index[1][15122:15128]

# val_data["user", "rates", "movie"].edge_label[11000:11050]

# From Val_data just group_by user_id, this will be ground truth, add X random neg samples from items where edge_label=0
# concatenate two list, load this data with LinkNeighborLoader, pass it to model and return the preds
user_id = 11

edge_index = val_data['user', 'rates', 'movie']['edge_label_index']
edge_label = val_data['user', 'rates', 'movie']['edge_label']

mask = edge_index[0] == user_id
filtered_edge_index = edge_index[:, mask]
filtered_edge_label = edge_label[mask]

# To create the val_ap_data we need to pass the movie_feat, node_ids. the movie_id in movie_feat is equal to movie_node_ids
val_ap_data = HeteroData()
val_ap_data["user"].node_id = filtered_edge_index[0]
val_ap_data["movie"].node_id = filtered_edge_index[1]
# Add the node features and edge indices:

val_ap_data["movie"].x = movie_feat[??, :]
val_ap_data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
val_ap_data = T.ToUndirected()(val_ap_data)


# val_ap_loader = LinkNeighborLoader(
#     data=val_data,
#     num_neighbors=[20, 10],
#     edge_label_index=filtered_edge_index,
#     edge_label=filtered_edge_label,
#     shuffle=False,
# )
# sampled_data = next(iter(val_ap_loader))

# preds = []
# ground_truths = []
# for sampled_data in tqdm.tqdm(val_loader):
#     with torch.no_grad():
#         sampled_data.to(device)
#         preds.append(model(sampled_data))
#         ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)
# pred = torch.cat(preds, dim=0).cpu().numpy()
# ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()