### LightGCN implementation for recommendation

In [18]:
#!python -m pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.2%2Bcu121.html
#!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
#!pip install torch-geometric

import torch
import pandas as pd
# Seed PyTorch's default random number generator so that runs are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x10647a9b0>

In [9]:
from torch_geometric.data import Data

class BipartiteData(Data):
    '''
    Graph container for a bipartite user/artist graph.

    Holds one edge index per direction (user -> artist and artist -> user)
    together with the size of each node set. All constructor arguments
    default to None so the object can also be created without arguments.
    '''

    def __init__(self, edge_index_u2a=None, edge_index_a2u=None, num_artists=None, num_users=None):
        super().__init__()
        # Edge indices first, node counts second (same order as before).
        self.edge_index_u2a, self.edge_index_a2u = edge_index_u2a, edge_index_a2u
        self.num_users, self.num_artists = num_users, num_artists

    def __inc__(self, key, value, *args, **kwargs):
        '''
        Return the offset added to attribute `key` when graphs are batched.

        The two edge indices get a per-row offset: the row holding user ids
        is shifted by `num_users`, the row holding artist ids by
        `num_artists`. Every other attribute uses the parent implementation.
        '''
        if key == 'edge_index_a2u':
            # Row 0 holds artists, row 1 holds users.
            return torch.tensor([[self.num_artists], [self.num_users]])
        if key == 'edge_index_u2a':
            # Row 0 holds users, row 1 holds artists.
            return torch.tensor([[self.num_users], [self.num_artists]])
        return super().__inc__(key, value)

In [12]:
#from src.get_pyg_data import load_bipartitedata
import torch.nn as nn
#import torch_scatter
from torch_geometric.nn.conv import MessagePassing

In [13]:
class LightGCN(MessagePassing):
    '''
    Parameter-free propagation layer: every target node receives the mean
    of the embeddings of its source-side neighbours.

    The layer works on a bipartite graph, so `forward` takes a pair of
    feature matrices `(x_source, x_target)`.
    '''

    def __init__(self, **kwargs):
        # node_dim=0: nodes are indexed along the first tensor dimension.
        super().__init__(node_dim=0, **kwargs)

    def forward(self, x, edge_index, size=None):
        '''
        Propagate source embeddings to target nodes.

        x: pair (x_source, x_target) of embedding matrices
        edge_index: 2 x E tensor, row 0 = source ids, row 1 = target ids
        size: optional (num_source_nodes, num_target_nodes)
        '''
        return self.propagate(edge_index=edge_index, x=(x[0], x[1]), size=size)

    def message(self, x_j):
        # The message is the unmodified source embedding (no weights, no
        # non-linearity).
        return x_j

    def aggregate(self, inputs, index, dim_size=None):
        '''
        Mean-reduce the messages per target node.

        Implemented with plain torch ops: the previous version called
        `torch_scatter.scatter`, but `torch_scatter` is never imported in
        this file (the import is commented out), which raised NameError.

        inputs: E x ... tensor of messages
        index: 1-D long tensor with the target node id of every message
        dim_size: number of target nodes; inferred from `index` when None
        '''
        if dim_size is None:
            dim_size = int(index.max()) + 1 if index.numel() > 0 else 0

        summed = inputs.new_zeros((dim_size,) + tuple(inputs.shape[1:]))
        summed.index_add_(0, index, inputs)

        # Nodes without incoming messages keep a zero vector: clamp their
        # count to 1 to avoid dividing by zero.
        counts = torch.bincount(index, minlength=dim_size).clamp(min=1)
        counts = counts.to(inputs.dtype).view(-1, *([1] * (inputs.dim() - 1)))
        return summed / counts

In [14]:
class LightGCNStack(torch.nn.Module):
    '''
    Stack of LightGCN layers with learnable user and artist embeddings.

    `args` must expose `latent_dim`, `num_layers` (>= 1) and `lambda_reg`.
    The embeddings are created by `init_data`, which has to be called
    before `forward`, `BPRLoss` or `topN`.
    '''

    def __init__(self, args):
        super(LightGCNStack, self).__init__()
        # Validate before building anything.
        assert (args.num_layers >= 1), 'Number of layers is not >=1'

        self.latent_dim = args.latent_dim
        self.num_layers = args.num_layers
        self.dataset = None
        self.embeddings_users = None
        self.embeddings_artists = None
        self.lambda_reg = args.lambda_reg

        self.convs = nn.ModuleList([LightGCN() for _ in range(args.num_layers)])

    def reset_parameters(self):
        '''
        Re-initialise both embedding tables. Does nothing for a table that
        has not been created yet (i.e. before `init_data`).
        '''
        for embedding in (self.embeddings_users, self.embeddings_artists):
            if embedding is not None:
                embedding.reset_parameters()

    def init_data(self, dataset):
        '''
        Attach the graph and create one embedding table per node type.

        The tables are placed on the device of `dataset.edge_index_u2a`;
        if that attribute is not a tensor, CUDA is used when available and
        the CPU otherwise.
        '''
        self.dataset = dataset

        edge_index = getattr(dataset, 'edge_index_u2a', None)
        if torch.is_tensor(edge_index):
            device = edge_index.device
        else:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.embeddings_users = torch.nn.Embedding(
            num_embeddings=dataset.num_users, embedding_dim=self.latent_dim).to(device)
        self.embeddings_artists = torch.nn.Embedding(
            num_embeddings=dataset.num_artists, embedding_dim=self.latent_dim).to(device)

    def forward(self):
        '''
        Return the final (user, artist) embedding matrices: the average of
        the initial embeddings and the output of every propagation layer.
        '''
        if self.dataset is None or self.embeddings_users is None or self.embeddings_artists is None:
            raise RuntimeError('init_data() must be called before forward()')

        data = self.dataset
        x_users, x_artists = self.embeddings_users.weight, self.embeddings_artists.weight
        num_terms = self.num_layers + 1  # layer 0 (raw embeddings) + num_layers

        final_embeddings_users = x_users / num_terms
        final_embeddings_artists = x_artists / num_terms
        for i in range(self.num_layers):
            x_users = self.convs[i]((x_artists, x_users), data.edge_index_a2u,
                                    size=(data.num_artists, data.num_users))
            # NOTE(review): the artist update below consumes the user
            # embeddings that were just updated in this same layer, not the
            # ones from the previous layer. Kept as-is to preserve model
            # behaviour - confirm this is intended.
            x_artists = self.convs[i]((x_users, x_artists), data.edge_index_u2a,
                                      size=(data.num_users, data.num_artists))
            final_embeddings_users = final_embeddings_users + x_users / num_terms
            final_embeddings_artists = final_embeddings_artists + x_artists / num_terms

        return final_embeddings_users, final_embeddings_artists

    def decode(self, z1, z2, pos_edge_index, neg_edge_index):
        '''
        Getting recommendation scores for the edges in pos_edge_index and
        neg_edge_index.

        z1 and z2 are embedding matrices (tensors, one row per node). If the
        edge index is of form (user, artist) then z1 is the user embedding
        matrix and z2 the artist embedding matrix, else the parameters are
        flipped.

        Returns a 1-D tensor with one score per edge, positive edges first.
        '''
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)  # concatenate pos and neg edges
        logits = (z1[edge_index[0]] * z2[edge_index[1]]).sum(dim=-1)  # dot product
        return logits

    def decode_all(self, z_users, z_artists):
        '''
        Get ranking score matrix for all combinations of users and artists
        '''
        prob_adj = z_users @ z_artists.t()  # dot product between all combinations
        return prob_adj

    def BPRLoss(self, prob_adj, real_adj, edge_index):
        '''
        Custom written BPR Loss function. It uses full-batch calculation, so it
        requires a lot of resources and does not scale for very large graphs.

        For every positive edge (user, artist) the loss adds the mean of
        -log(sigmoid(pos_score - neg_score)) over all artists the user has
        no edge to. An L2 penalty on both embedding tables, weighted by
        `lambda_reg`, is added at the end.

        prob_adj: NxM ranking score matrix for all users and artists
        real_adj: NxM adjacency, either a dense array/tensor or a scipy
                  sparse matrix (densified here); zero entries mark negatives
        edge_index: 2xE index of the positive (user, artist) edges
        '''
        if hasattr(real_adj, 'toarray'):
            real_adj = real_adj.toarray()
        neg_mask = torch.as_tensor(real_adj, device=prob_adj.device) == 0

        # Explicit (row, col) indexing instead of indexing with a raw 2xE
        # numpy array, which relied on legacy sequence-as-tuple behaviour.
        users = edge_index[0].to(prob_adj.device)
        artists = edge_index[1].to(prob_adj.device)
        pos_scores = prob_adj[users, artists]

        loss = prob_adj.new_zeros(())
        neg_cache = {}  # user id -> scores of that user's negative artists
        for pos_score, user in zip(pos_scores, users.tolist()):
            neg_scores = neg_cache.get(user)
            if neg_scores is None:
                neg_scores = prob_adj[user][neg_mask[user]]
                neg_cache[user] = neg_scores
            if neg_scores.numel() == 0:
                # User is connected to every artist: nothing to rank against.
                continue
            # logsigmoid is the numerically stable form of log(sigmoid(x)),
            # which underflows to -inf for large negative margins.
            loss = loss - torch.nn.functional.logsigmoid(pos_score - neg_scores).mean()

        # Squared Frobenius norm of both embedding tables.
        loss = loss + self.lambda_reg * (self.embeddings_users.weight.pow(2).sum() +
                                         self.embeddings_artists.weight.pow(2).sum())

        return loss

    def topN(self, user_id, n):
        '''
        Get the top N recommendations for user with ID user_id based on
        ranking scores.

        Returns the result of torch.topk (fields `values` and `indices`).
        If n exceeds the number of artists, all artists are returned.
        '''
        # Inference only: no autograd graph needed.
        with torch.no_grad():
            z_users, z_artists = self.forward()
            scores = torch.squeeze(z_users[user_id] @ z_artists.t())
            return torch.topk(scores, k=min(n, scores.numel()))

In [15]:
import scipy
from torch_geometric.utils import negative_sampling

def to_scipy_sparse_matrix(edge_index, num_nodes):
    '''
    Build the (rows x cols) adjacency matrix of a bipartite edge index.

    edge_index: 2 x E tensor, row 0 = row ids, row 1 = column ids
    num_nodes: pair (number of rows, number of columns)

    Returns a scipy.sparse.coo_matrix holding 1.0 for every edge.
    '''
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.sparse` is loaded.
    from scipy.sparse import coo_matrix

    row, col = edge_index.cpu()
    edge_attr = torch.ones(row.size(0))
    out = coo_matrix(
        (edge_attr.numpy(), (row.numpy(), col.numpy())), (num_nodes[0], num_nodes[1]))
    return out

def train(model, data, optimizer):
    '''
    Run one full-batch optimisation step and return the loss.

    model: exposes forward(), decode_all() and BPRLoss()
    data: graph with edge_index_u2a, num_users and num_artists
    optimizer: optimizer over the model parameters

    Side effect: stores freshly sampled negative edges in
    `data.neg_edge_index_u2a`.

    Returns the loss as a detached scalar tensor, so the caller does not
    keep this step's autograd graph alive.
    '''
    model.train()
    pos_edge_index = data.edge_index_u2a

    # The sampled negatives are only stored on `data`; the loss computed
    # below derives its negatives from the dense adjacency instead.
    data.neg_edge_index_u2a = negative_sampling(
        edge_index=pos_edge_index,  # positive edges
        num_nodes=(data.num_users, data.num_artists),  # number of nodes
        num_neg_samples=pos_edge_index.size(1),  # as many negatives as positives
        method='sparse').to(pos_edge_index.device)  # follow the graph's device

    optimizer.zero_grad()

    z_users, z_artists = model.forward()  # encode
    real_adj = to_scipy_sparse_matrix(
        pos_edge_index, num_nodes=(data.num_users, data.num_artists)).toarray()
    loss = model.BPRLoss(model.decode_all(z_users, z_artists), real_adj, pos_edge_index)

    loss.backward()
    optimizer.step()

    return loss.detach()

In [16]:
class objectview(object):
    '''
    Expose dictionary entries as attributes.

    Accepts the same arguments as `dict(...)`; every resulting key becomes
    an attribute of the instance.
    '''

    def __init__(self, *args, **kwargs):
        # The freshly built dict becomes the instance namespace itself.
        self.__dict__ = dict(*args, **kwargs)

In [19]:
import time

# Wrapper for evaluation
class LightGCN_recommender:
    '''
    Thin fit/recommend wrapper around LightGCNStack.

    `args` is a mapping of hyper-parameters; it is wrapped in `objectview`
    so the model can read the entries as attributes. `epochs` is read here,
    the remaining keys are consumed by LightGCNStack.
    '''

    def __init__(self, args):
        self.args = objectview(args)
        # Use the GPU when there is one instead of requiring CUDA.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = LightGCNStack(args=self.args).to(self.device)
        self.optimizer = None
        self.default_recommendation = []  # popularity ranking, filled by fit()
        self.a_rev_dict = None
        self.u_rev_dict = None
        self.a_dict = None
        self.u_dict = None

    def fit(self, data: pd.DataFrame):
        '''
        Train the model on an interaction table.

        data: DataFrame with (at least) an "item_id" column; the remaining
              schema is whatever `load_bipartitedata` expects.
        '''
        # Default rankings when userID is not in training set
        self.default_recommendation = data["item_id"].value_counts().index.tolist()

        # LightGCN
        # NOTE(review): `load_bipartitedata` must already be in scope; the
        # only import of it visible in this file is commented out.
        data, self.u_rev_dict, self.a_rev_dict, self.u_dict, self.a_dict = load_bipartitedata(data)
        data = data.to(self.device)
        self.model.init_data(data)
        # NOTE(review): the learning rate is fixed here; an `lr` entry in
        # args is not consulted - confirm this is intended.
        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=0.001)

        log = 'Epoch: {:03d}, Loss: {:.4f}, Elapsed time: {:.2f}'
        for epoch in range(1, self.args.epochs+1):
            start = time.perf_counter()
            train_loss = train(self.model, data, self.optimizer)
            print(log.format(epoch, float(train_loss), time.perf_counter()-start))

    def recommend(self, user_id, n):
        '''
        Return n recommended item ids for `user_id`.

        Users that were not part of the training data get the n most
        frequent items of the training table instead.
        '''
        # Only the id lookup is guarded, so a KeyError raised inside the
        # model is not mistaken for an unknown user.
        try:
            user_index = self.u_dict[str(user_id)]
        except KeyError:
            return self.default_recommendation[:n]

        top = self.model.topN(user_index, n=n)
        return [self.a_rev_dict[index] for index in top.indices.cpu().tolist()]

In [20]:
from functools import partial

# Hyper-parameters handed to LightGCN_recommender.
args = dict(
    model_type='LightGCN',
    num_layers=3,
    latent_dim=32,
    dropout=0,
    epochs=100,
    opt='adam',
    opt_scheduler='none',
    opt_restart=0,
    weight_decay=5e-3,
    lr=0.1,
    lambda_reg=1e-4,
)
