In [1]:
import sys  
sys.path.insert(0, '/Users/pratikaher/DGL/GRecSy/')

from random import sample
import dgl
import torch
import torch.nn.functional as F
import pandas as pd
from utils import set_random_seed, collate_molgraphs, load_model, collate_movie_graphs, train_test_split_by_time, _split_data
import numpy as np
import torch
import torch.nn as nn
import argparse
from configure import get_exp_configure
import scipy.sparse as sp
from torch.utils.data import DataLoader

from dgllife.utils import EarlyStopping, Meter
from model import compute_loss

df_rating = pd.read_csv('/Users/pratikaher/DGL/graph-rec/ml-1m/ratings.dat', sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'],  engine='python')
df_user = pd.read_csv('/Users/pratikaher/DGL/graph-rec/ml-1m/users.dat', sep='::', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zipcode'],  engine='python')
df_movie = pd.read_csv('/Users/pratikaher/DGL/graph-rec/ml-1m/movies.dat', sep='::', header=None, names=['movie_id', 'title', 'genre'],  engine='python')

df_temp = df_rating.merge(df_movie, left_on='movie_id', right_on='movie_id', how='left')
df_final = df_temp.merge(df_user, left_on='user_id', right_on='user_id', how='left')

df_final = df_final[["user_id","movie_id","rating","age"]]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_final.tail()

Unnamed: 0,user_id,movie_id,rating,age
1000204,6040,1091,1,25
1000205,6040,1094,5,25
1000206,6040,562,5,25
1000207,6040,1096,4,25
1000208,6040,1097,4,25


# Prepare Dataloaders

In [3]:
df_final['movie_id'] = df_final['movie_id'] + 6040

In [6]:
g = dgl.graph((df_final['user_id'].to_numpy(), df_final['movie_id'].to_numpy()))

isolated_nodes = ((g.in_degrees() == 0) & (g.out_degrees() == 0)).nonzero().squeeze(1)
g = dgl.remove_nodes(g, isolated_nodes)

g.edata['rating'] = torch.unsqueeze(torch.tensor(df_final['rating'].to_numpy()), dim = 1)
g.ndata['age'] = torch.randn(9746, 1).float()
# g.ndata['age'] = torch.unsqueeze(torch.tensor(df_user['age'].to_numpy()).float() / 100, dim = 1)

In [7]:
g

Graph(num_nodes=9746, num_edges=1000209,
      ndata_schemes={'age': Scheme(shape=(1,), dtype=torch.float32)}
      edata_schemes={'rating': Scheme(shape=(1,), dtype=torch.int64)})

In [8]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size

In [9]:
test_size, train_size

(100020, 900189)

In [10]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [11]:
sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5])
dataloader = dgl.dataloading.EdgeDataLoader(
        train_g, train_g.nodes(), 
        sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(5), 
        shuffle=True, drop_last=False,
        batch_size=16
        )



In [45]:
test_g = dgl.remove_edges(g, eids[test_size:])

In [56]:
test_g

Graph(num_nodes=9746, num_edges=100020,
      ndata_schemes={'age': Scheme(shape=(1,), dtype=torch.float32)}
      edata_schemes={'rating': Scheme(shape=(1,), dtype=torch.int64)})

In [15]:
test_dataloader = dgl.dataloading.EdgeDataLoader(
        test_g, test_g.nodes(), 
        sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(5), 
        shuffle=True, drop_last=False,
        batch_size=16
        )

# Define the GNN Model

In [32]:
from dgllife.model.gnn import MPNNGNN
import dgl.function as fn

class DotProductPredictor(nn.Module):
    def forward(self, graph, h):
        # h contains the node representations computed from the GNN defined
        # in the node classification section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return graph.edata['score']

class MPNNPredictor(nn.Module):
    def __init__(self,
                 node_in_feats,
                 edge_in_feats,
                 node_out_feats=64,
                 edge_hidden_feats=128,
                 n_tasks=1,
                 num_step_message_passing=6):
        super(MPNNPredictor, self).__init__()

        self.gnn = MPNNGNN(node_in_feats=node_in_feats,
                           node_out_feats=node_out_feats,
                           edge_in_feats=edge_in_feats,
                           edge_hidden_feats=edge_hidden_feats,
                           num_step_message_passing=num_step_message_passing)
        self.predictor = DotProductPredictor()
    
    def forward(self, g, node_feats, edge_feats, node_subgraph_negative):
        ```
        ```
        node_feats = self.gnn(g, node_feats, edge_feats)
        return self.predictor(g, node_feats), self.predictor(node_subgraph_negative, node_feats)
    
    def get_repr(self, g, node_feats, edge_feats):
        """Returns the embedded representation given block made from sampling neighboring nodes."""
        
        node_feats = self.gnn(g, node_feats, edge_feats)
        return node_feats
        
#         # project features
#         h_item = self.project_node_feats(node_feats_src)
#         # node's own learnable embedding
#         h_item_dst = self.proj(node_feats_dst)

        # embedding + GNN output
#         return h_item_dst + self.sage(blocks, h_item)

In [33]:
mpnn_model = MPNNPredictor(train_g.ndata['age'].shape[1], train_g.edata['rating'].shape[1])

In [34]:
optimizer = torch.optim.Adam(mpnn_model.parameters(), lr=0.0001,weight_decay=0)

In [35]:
def compute_loss(pos_score, neg_score):
    pos_score, neg_score = pos_score.squeeze(), neg_score.squeeze()
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

# def compute_loss(pos_score, neg_score):
#     # Margin loss
#     n_edges = pos_score.shape[0]
#     return (1 - pos_score + neg_score.view(n_edges, -1)).clamp(min=0).mean()

In [36]:
def run_a_train_epoch(epoch, model, dataloader, optimizer):
    
    total_loss = 0
    batch_number = 0
    
    for _, pos_g, neg_g, blocks in dataloader:
        
        h = pos_g.ndata.pop('age')
        e = pos_g.edata.pop('rating')
        h, e = h.to('cpu', dtype=torch.float), e.to('cpu', dtype=torch.float)

        pos_score, neg_score = model(pos_g, h, e, neg_g)
        loss = compute_loss(pos_score, neg_score)

        if epoch > 0:  # For the epoch 0, no training (just report loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        total_loss += loss.item()
        batch_number += 1

    train_avg_loss = total_loss / batch_number
    
    return total_loss

## Train the model

In [37]:
for epoch in range(2):
    
    average_loss = run_a_train_epoch(epoch, mpnn_model, dataloader, optimizer)
    print("Epoch :{} has a average loss of : {}".format(epoch, average_loss))

Epoch :0 has a average loss of : 570.2678933143616
Epoch :1 has a average loss of : 404.3635984659195


# Save Down Models

In [60]:
# use this to resume training for models
state = {
        'model_state_dict': mpnn_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
#         'loss': total_val_loss,
#         'item_embeddings': h_item,
        'batch_size': 16
            }
import os
model_dir = '/Users/pratikaher/DGL/models/'
model_fn = "{}_model_{}.pth".format('gnn_trained_model', 10)
torch.save(state, os.path.join(model_dir, model_fn))

In [70]:
# use this to save down entire models
torch.save(mpnn_model, os.path.join(model_dir, 'mpnn_model_save.pth'))

# Then later:
mpnn_model_loaded = torch.load(os.path.join(model_dir, 'mpnn_model_save.pth'))

In [71]:
mpnn_model_loaded

MPNNPredictor(
  (gnn): MPNNGNN(
    (project_node_feats): Sequential(
      (0): Linear(in_features=1, out_features=64, bias=True)
      (1): ReLU()
    )
    (gnn_layer): NNConv(
      (edge_func): Sequential(
        (0): Linear(in_features=1, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=4096, bias=True)
      )
    )
    (gru): GRU(64, 64)
  )
  (predictor): DotProductPredictor()
)

TODOs

- Find a way to test the model
- Retrieve embeddings from the model


# Compute Test Scores

In [78]:
# compute loss on test set
from sklearn.metrics import roc_auc_score

def compute_test_score(mpnn_model, test_dataloader):
    
    total_loss = 0
    
    with torch.no_grad():
        for _, pos_g, neg_g, blocks in test_dataloader:
            
            h = pos_g.ndata.pop('age')
            e = pos_g.edata.pop('rating')
            h, e = h.to('cpu', dtype=torch.float), e.to('cpu', dtype=torch.float)

            pos_score, neg_score = mpnn_model(pos_g, h, e, neg_g)
            loss = compute_loss(pos_score, neg_score)

            total_loss += loss.item()

    return total_loss
    

In [79]:
compute_test_score(mpnn_model, test_dataloader)

238.26571887731552

In [27]:
# sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5])
# nodeloader_test = dgl.dataloading.NodeDataLoader(
#         test_g,
#         test_g.nodes(),
#         sampler,
#         batch_size=16,
#         shuffle=True,
#         drop_last=False
#     )

In [30]:
# for input_nodes, output_nodes, blocks in nodeloader_test:
# #     print(input_nodes, blocks)
#     for block in blocks:
#         graph = dgl.block_to_graph(block)
#         print(graph)
#         break
#     break

Graph(num_nodes={'_N_src': 47, '_N_dst': 37},
      num_edges={('_N_src', '_E', '_N_dst'): 22},
      metagraph=[('_N_src', '_N_dst', '_E')])


# Get embeddings of input nodes from the trained model

In [47]:
test_node_features = test_g.ndata['age']
test_edge_features = test_g.edata['rating']

In [52]:
test_node_features, test_edge_features = test_node_features.to('cpu', dtype=torch.float), test_edge_features.to('cpu', dtype=torch.float)
representation_embeddings = mpnn_model.get_repr(test_g, test_node_features, test_edge_features)

In [54]:
representation_embeddings

tensor([[-0.0113,  0.0763, -0.0321,  ..., -0.0382,  0.0353,  0.1068],
        [ 0.0434, -0.0375, -0.0216,  ...,  0.0040, -0.0154, -0.1385],
        [ 0.0433, -0.0204, -0.0159,  ..., -0.0216,  0.0093, -0.0553],
        ...,
        [ 1.0000,  1.0000, -1.0000,  ..., -1.0000,  1.0000,  1.0000],
        [ 1.0000,  0.9999, -1.0000,  ..., -1.0000,  1.0000, -0.8890],
        [ 1.0000,  1.0000, -1.0000,  ..., -1.0000,  1.0000,  1.0000]],
       grad_fn=<SqueezeBackward1>)