In [11]:
import dgl
from dgl.data import DGLDataset
import torch
import os
import pandas as pd
import numpy as np
import torch
import itertools
import numpy as np
import scipy.sparse as sp
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
from dgl.nn.pytorch import conv as dgl_conv
from sklearn.metrics import roc_auc_score
from startup_data_set import COMP4222Dataset
device = torch.device('cpu')

In [2]:
dataset = COMP4222Dataset()
graph = dataset[0]
print(graph)

Graph(num_nodes=25446, num_edges=45621,
      ndata_schemes={'feat': Scheme(shape=(221,), dtype=torch.float64), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'feat': Scheme(shape=(26,), dtype=torch.float64)})


In [3]:
in_feats = graph.ndata['feat'].shape[1]

In [4]:
u, v = graph.edges()
# give id for all edges then permutation
eids = np.arange(graph.number_of_edges())
eids = np.random.permutation(eids)

# use 10% as test set
test_size = int(len(eids) * 0.1)
train_size = graph.number_of_edges() - test_size

test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing

#use sparse matrix to save memory
# ,shape = (torch.max(v)+1,torch.max(v)+1)
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(torch.max(u)+1,torch.max(v)+1)
neg_u, neg_v = np.where(adj_neg != 0) # negative edge, we don't have edge

neg_eids = np.random.choice(len(neg_u), graph.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [5]:
# Debug

In [6]:
train_g = dgl.remove_edges(graph, eids[:test_size])
train_g = dgl.add_self_loop(train_g)
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=graph.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=graph.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=graph.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=graph.number_of_nodes())

In [22]:
class GraphSAGEModel(nn.Module):
    def __init__(self,
                 in_feats,
                 n_hidden,
                 out_dim,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        super(GraphSAGEModel, self).__init__()

        self.layers = nn.ModuleList()
        # input layer
        self.layers.append(dgl_conv.SAGEConv(in_feats, n_hidden, aggregator_type,
                                         feat_drop=dropout, activation=activation))
        # hidden layers
        for i in range(n_layers - 1):
            self.layers.append(dgl_conv.SAGEConv(n_hidden, n_hidden, aggregator_type,
                                             feat_drop=dropout, activation=activation))
        # output layer
        self.layers.append(dgl_conv.SAGEConv(n_hidden, out_dim, aggregator_type,
                                         feat_drop=dropout, activation=None))
        
    def forward(self, g, features):
        h = features.float()
        for layer in self.layers:
            h = layer(g, h).float()
        return h

In [24]:
# Hyperparameters
n_hidden = 64
n_layers = 2
dropout = 0.5
aggregator_type = 'mean'

gconv_model = GraphSAGEModel(in_feats,
                             n_hidden,
                             n_hidden,
                             n_layers,
                             F.relu,
                             dropout,
                             aggregator_type)

In [25]:
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [26]:
import itertools

optimizer = torch.optim.Adam(itertools.chain(gconv_model.parameters(), pred.parameters()), lr=0.01)
for e in range(150):
    # forward
    h = gconv_model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 9.045723915100098
In epoch 5, loss: 0.8431828022003174
In epoch 10, loss: 0.6943535208702087
In epoch 15, loss: 0.6908046007156372
In epoch 20, loss: 0.6906868815422058
In epoch 25, loss: 0.6911425590515137
In epoch 30, loss: 0.6902279257774353
In epoch 35, loss: 0.6895298361778259
In epoch 40, loss: 0.68839031457901
In epoch 45, loss: 0.6869620680809021
In epoch 50, loss: 0.6846441626548767
In epoch 55, loss: 0.6784398555755615
In epoch 60, loss: 0.6762485504150391
In epoch 65, loss: 0.6652789115905762
In epoch 70, loss: 0.6603643894195557
In epoch 75, loss: 0.6487939953804016
In epoch 80, loss: 0.640209436416626
In epoch 85, loss: 0.6273946762084961
In epoch 90, loss: 0.6154678463935852
In epoch 95, loss: 0.6054980754852295
In epoch 100, loss: 0.5993087887763977
In epoch 105, loss: 0.5985466241836548
In epoch 110, loss: 0.5821447372436523
In epoch 115, loss: 0.581139326095581
In epoch 120, loss: 0.5762310028076172
In epoch 125, loss: 0.5707046985626221
In epoch 130,