In [40]:
%load_ext autoreload
%autoreload 2
import gust  # library for loading graph data

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributions as dist
import time

torch.set_default_tensor_type('torch.cuda.FloatTensor')
%matplotlib inline
sns.set_style('whitegrid')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
# Load the dataset using `gust` library
# graph.standardize() makes the graph unweighted, undirected and selects
# the largest connected component
# graph.unpack() returns the necessary vectors / matrices

A, X, _, y = gust.load_dataset('cora').standardize().unpack()
# A - adjacency matrix 
# X - attribute matrix - not needed
# y - node labels

if (A != A.T).sum() > 0:
    raise RuntimeError("The graph must be undirected!")

if (A.data != 1).sum() > 0:
    raise RuntimeError("The graph must be unweighted!")

In [42]:


num_nodes = A.shape[0]
num_edges = A.sum()

# Convert adjacency matrix to a CUDA Tensor
adj = torch.FloatTensor(A.toarray()).cuda()

In [43]:
torch.manual_seed(123)
# Define the embedding matrix
embedding_dim = 64
emb = nn.Parameter(torch.empty(num_nodes, embedding_dim).normal_(0.0, 1.0))

# Initialize the bias
# The bias is initialized in such a way that if the dot product between two embedding vectors is 0 
# (i.e. z_i^T z_j = 0), then their connection probability is sigmoid(b) equals to the 
# background edge probability in the graph. This significantly speeds up training
edge_proba = num_edges / (num_nodes**2 - num_nodes)
bias_init = np.log(edge_proba / (1 - edge_proba))
b = nn.Parameter(torch.Tensor([bias_init]))

# Regularize the embeddings but don't regularize the bias
# The value of weight_decay has a significant effect on the performance of the model (don't set too high!)
opt = torch.optim.Adam([
    {'params': [emb], 'weight_decay': 1e-7},
    {'params': [b]}],
    lr=1e-2)

In [44]:
# There are many ways to compute the loss / negative log-likelihood of the model
# There are many ways to compute the loss / negative log-likelihood of the model
def compute_loss_v1(adj, emb, b=0.0): 
    """Compute the negative log-likelihood of the Bernoulli model."""
    logits = emb @ emb.t() + b
    loss = F.binary_cross_entropy_with_logits(logits, adj, reduction='none')
    # Since we consider graphs without self-loops, we don't want to compute loss
    # for the diagonal entries of the adjacency matrix.
    # This will kill the gradients on the diagonal.
    loss[np.diag_indices(adj.shape[0])] = 0.0
    return loss.mean()



def compute_loss_gaussian(adj, emb, b=0.0):
    eps=1e-5
    N = adj.shape[0]
    d=64
    e1, e2 = adj.nonzero()
    pdist = ((emb[:, None] - emb[None, :]).pow(2.0).sum(-1) + eps).sqrt()
    neg_term = torch.log(-torch.expm1(-pdist) + 1e-5)
    neg_term[np.diag_indices(N)] = 0.0
    pos_term = -pdist[e1, e2]
    neg_term[e1, e2] = 0.0
    return -(pos_term.sum() + neg_term.sum()) / emb.shape[0]**2



def distance2(adj, emb, b=0.0):
    eps=1e-5
    N = adj.shape[0]
    d=64
    e1, e2 = adj.nonzero()
    pdist = ((emb[:, None] - emb[None, :]).pow(2.0).sum(-1) + eps).sqrt()
    sigdist = 1/(1+torch.exp(-10*(1-pdist))+eps)
    neg_term = torch.log(1-sigdist +eps)
    neg_term[np.diag_indices(N)] = 0.0
    pos_term = torch.log(sigdist+ eps)[e1, e2]
    neg_term[e1,e2] = 0.0
    return -(pos_term.sum() + neg_term.sum()) / emb.shape[0]**2

# In general, it's very important to compute all the losses in a numerically stable way
# (e.g. using the log-sum-exp trick) or use existing library functions

In [45]:
max_epochs = 1000
display_step = 250


compute_loss = compute_loss_v1

for epoch in range(max_epochs):
    opt.zero_grad()
    loss = compute_loss(adj, emb, b)
    loss.backward()
    opt.step()
    # Training loss is printed every display_step epochs
    if epoch % display_step == 0:
        print(f'Epoch {epoch:4d}, loss = {loss.item():.5f}')

Epoch    0, loss = 1.01289
Epoch  250, loss = 0.01116
Epoch  500, loss = 0.00455
Epoch  750, loss = 0.00280


In [30]:
max_epochs = 1000
display_step = 250


compute_loss =  compute_loss_gaussian

for epoch in range(max_epochs):
    opt.zero_grad()
    loss = compute_loss(A, emb, b)
    loss.backward()
    opt.step()
    # Training loss is printed every display_step epochs
    if epoch % display_step == 0:
        print(f'Epoch {epoch:4d}, loss = {loss.item():.5f}')

Epoch    0, loss = 0.01374
Epoch  250, loss = 0.00745
Epoch  500, loss = 0.00675
Epoch  750, loss = 0.00619


In [46]:
max_epochs = 1000
display_step = 1


compute_loss = distance2

for epoch in range(max_epochs):
    opt.zero_grad()
    loss = compute_loss(A, emb, b)
    loss.backward()
    opt.step()
    # Training loss is printed every display_step epochs
    if epoch % display_step == 0:
        print(f'Epoch {epoch:4d}, loss = {loss.item():.5f}')

Epoch    0, loss = 0.01888
Epoch    1, loss = nan
Epoch    2, loss = nan
Epoch    3, loss = nan
Epoch    4, loss = nan
Epoch    5, loss = nan
Epoch    6, loss = nan
Epoch    7, loss = nan
Epoch    8, loss = nan
Epoch    9, loss = nan
Epoch   10, loss = nan
Epoch   11, loss = nan
Epoch   12, loss = nan
Epoch   13, loss = nan
Epoch   14, loss = nan
Epoch   15, loss = nan
Epoch   16, loss = nan
Epoch   17, loss = nan
Epoch   18, loss = nan
Epoch   19, loss = nan
Epoch   20, loss = nan
Epoch   21, loss = nan
Epoch   22, loss = nan
Epoch   23, loss = nan
Epoch   24, loss = nan
Epoch   25, loss = nan
Epoch   26, loss = nan
Epoch   27, loss = nan
Epoch   28, loss = nan
Epoch   29, loss = nan
Epoch   30, loss = nan
Epoch   31, loss = nan


KeyboardInterrupt: 

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_mutual_info_score

if type(emb) is not np.ndarray: 
    emb = emb.cpu().detach().numpy()

X, labels_true = emb, y
n_cluster = len(set(labels_true))
init = np.zeros((n_cluster,embedding_dim))
for i in range(n_cluster):
    init[i,:] = X[np.where(labels_true==i)].mean(axis=0)
kmeans =  KMeans(n_clusters=n_cluster, random_state=0, init= init).fit(X)

labels = kmeans.labels_

print("Mutual Information: %0.3f"
      % mutual_info_score(labels_true, labels))
print("Normalized Mutual Information: %0.3f"
      % normalized_mutual_info_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % adjusted_mutual_info_score(labels_true, labels))