Data :    
- https://graphsandnetworks.com/the-cora-dataset/


Reference : 
- https://www.kaggle.com/code/mhyodo/en-jp-graphneuralnetwork-basictask
- https://github.com/siqim/Machine-Learning-with-Graphs/blob/697d83bb206be0825ebaf0dad128b9eb24908705/examples/2_GCN/GCN.py
- https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py

In [14]:
!pip install dgl-cu110 dglgo -f https://data.dgl.ai/wheels/repo.html &>/dev/null

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import dgl
from dgl.data import DGLDataset
import dgl.nn as dglnn
import networkx as nx
import scipy.sparse as sp


In [8]:
# read data
content_path = "/content/drive/MyDrive/cora/cora.content"
cite_path = "/content/drive/MyDrive/cora/cora.cites"

with open(content_path, "r") as fp:
    contents = fp.readlines()
with open(cite_path, "r") as fp:
    cites = fp.readlines()

contents = np.array([np.array(l.strip().split("\t")) for l in contents])
paper_list, feat_list, label_list = np.split(contents, [1,-1], axis=1)
paper_list, label_list = np.squeeze(paper_list), np.squeeze(label_list)
# Paper -> Index dict
paper_dict = dict([(key, val) for val, key in enumerate(paper_list)])
# Label -> Index dict
labels = list(set(label_list))
label_dict = dict([(key, val) for val, key in enumerate(labels)])
# Edge_index
cites = [i.strip().split("\t") for i in cites]
cites = np.array([[paper_dict[i[0]], paper_dict[i[1]]] for i in cites], 
                 np.int64).T   # (2, edge)
cites = np.concatenate((cites, cites[::-1, :]), axis=1)  # (2, 2*edge) or (2, E)
# Degree
_, degree_list = np.unique(cites[0,:], return_counts=True)
# Input
node_num = len(paper_list)
feat_dim = feat_list.shape[1]
stat_dim = 32
num_class = len(labels)
T = 2
feat_Matrix = torch.Tensor(feat_list.astype(np.float32))
X_Node, X_Quote = np.split(cites, 2, axis=0)
X_Node, X_Quote = torch.from_numpy(np.squeeze(X_Node)), \
                 torch.from_numpy(np.squeeze(X_Quote))
label_list = np.array([label_dict[i] for i in label_list])
label_list = torch.from_numpy(label_list)
print('Node:', X_Node) # paper id
print('Quote:', X_Quote) # quote paper id
print('label:', label_list) # label (paper genre)
print('feat:', feat_Matrix) # feat

Node: tensor([ 163,  163,  163,  ..., 2258, 1887, 1686])
Quote: tensor([ 402,  659, 1696,  ..., 1887, 1902,  837])
label: tensor([2, 4, 5,  ..., 1, 0, 2])
feat: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


- https://github.com/Uroboros0313/GraphEmbedding_Pytorch/blob/f9cd24893f1cfd934d9e71e22b0cbaeeb69a9383/utils/data_utils.py

In [11]:
# G = nx.Graph()
# content_path = "/content/drive/MyDrive/cora/cora.content"
# cite_path = "/content/drive/MyDrive/cora/cora.cites"

# with open(cite_path, 'r', encoding='utf-8') as f:
#     lines = f.readlines()
#     edge_list = [tuple(edge_info.strip().split('\t')) for edge_info in lines]
#     G.add_edges_from(edge_list)

# with open(content_path, 'r', encoding='utf-8') as f:
#     lines = f.readlines()
#     nodes = []
#     feats = []
#     labels = []
#     for feature_info in lines:
#         features = feature_info.strip().split('\t')
#         nodes.append(features[0])
#         feats.append([int(col_feat) for col_feat in features[1: -1]])
#         labels.append(features[-1])

# unique_labels = list(set(labels))
# label_map = {label: i for i, label in enumerate(unique_labels)}   

# for node, feat, label in zip(nodes, feats, labels):
#     G.nodes[node]['feature'] = np.asarray(feat)
#     G.nodes[node]['label'] = label_map[label]
    

In [16]:
g = dgl.graph((X_Node, X_Quote), num_nodes=node_num)
g.ndata['feat'] = feat_Matrix
g.ndata['label'] = label_list

In [17]:
n_nodes = node_num
n_train = int(n_nodes * 0.6)
n_val = int(n_nodes * 0.2)
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
val_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask = torch.zeros(n_nodes, dtype=torch.bool)
train_mask[:n_train] = True
val_mask[n_train:n_train + n_val] = True
test_mask[n_train + n_val:] = True
g.ndata['train_mask'] = train_mask
g.ndata['val_mask'] = val_mask
g.ndata['test_mask'] = test_mask

In [20]:
print('Graph summary')
print(g)
print('Node features')
print(g.ndata)
print('Edge features')
print(g.edata)

Graph summary
Graph(num_nodes=2708, num_edges=10858,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})
Node features
{'feat': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'label': tensor([2, 4, 5,  ..., 1, 0, 2]), 'train_mask': tensor([ True,  True,  True,  ..., False, False, False]), 'val_mask': tensor([False, False, False,  ..., False, False, False]), 'test_mask': tensor([False, False, False,  ...,  True,  True,  True])}
Edge features
{}


In [27]:
# https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py
# load data from cora 
content_path = "/content/drive/MyDrive/cora/cora.content"
cite_path = "/content/drive/MyDrive/cora/cora.cites"

idx_features_labels = np.genfromtxt(content_path, dtype=np.dtype(str))

In [33]:
idx_features_labels[0]

array(['31336', '0', '0', ..., '0', '0', 'Neural_Networks'], dtype='<U22')

In [58]:
features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
classes = set(idx_features_labels[:, -1])
classes_dict = {c: np.identity(len(classes))[i, :] for i, c in
                enumerate(classes)}
labels = np.array(list(map(classes_dict.get, idx_features_labels[:, -1])),
                          dtype=np.int32)

In [59]:
features[0], classes, labels

(<1x1433 sparse matrix of type '<class 'numpy.float32'>'
 	with 1433 stored elements in Compressed Sparse Row format>,
 {'Case_Based',
  'Genetic_Algorithms',
  'Neural_Networks',
  'Probabilistic_Methods',
  'Reinforcement_Learning',
  'Rule_Learning',
  'Theory'},
 array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0]], dtype=int32))

In [37]:
# build graph 
idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
idx_map = {j: i for i, j in enumerate(idx)}
print(idx)

[  31336 1061127 1106406 ... 1128978  117328   24043]


In [43]:
edges_unordered = np.genfromtxt(cite_path, dtype=np.int32)
edges_unordered[0], edges_unordered[1]

(array([  35, 1033], dtype=int32), array([    35, 103482], dtype=int32))

In [61]:
edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
edges[0], edges[1]

(array([163, 402], dtype=int32), array([163, 659], dtype=int32))

In [62]:
adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])), shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

In [63]:
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

In [64]:
def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx
    
features = normalize(features) 
adj = normalize(adj + sp.eye(adj.shape[0])) 

In [69]:
idx_train, idx_val, idx_test = range(140), range(200,500), range(500, 1500) 
idx_train = torch.LongTensor(idx_train) 
idx_val = torch.LongTensor(idx_val) 
idx_test = torch.LongTensor(idx_test)

In [67]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

features = torch.FloatTensor(np.array(features.todense()))
labels = torch.LongTensor(np.where(labels)[1]) 
adj = sparse_mx_to_torch_sparse_tensor(adj)

In [71]:
adj, features, labels

(tensor(indices=tensor([[   0,    8,   14,  ..., 1389, 2344, 2707],
                        [   0,    0,    0,  ..., 2707, 2707, 2707]]),
        values=tensor([0.1667, 0.1667, 0.0500,  ..., 0.2000, 0.5000, 0.2500]),
        size=(2708, 2708), nnz=13264, layout=torch.sparse_coo),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([2, 4, 5,  ..., 1, 0, 2]))

In [73]:
adj.shape, features.shape, labels.shape

(torch.Size([2708, 2708]), torch.Size([2708, 1433]), torch.Size([2708]))

In [32]:
class GCN(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super(GCN, self).__init__()
        self.conv1 = nn.Linear(in_features, hidden_features)
        self.conv2 = nn.Linear(hidden_features, out_features)

    def forward(self, x, adj):
        x = F.relu(self.conv1(x))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x)
        return F.log_softmax(x, dim=1)


In [98]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(g.ndata['feat'].shape[1], 128, len(g.ndata['label'].unique())).to(device)

In [99]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [25]:
g

Graph(num_nodes=2708, num_edges=10858,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

In [84]:
def cal_accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

In [103]:
def train(epoch, features, adj, labels, idx_train):
    model.train()
    adj = adj.to(device) 
    features = features.to(device) 
    labels = labels.to(device)
    optimizer.zero_grad()
    output = model(features, adj)
    loss = criterion(output[idx_train], labels[idx_train])
    loss.backward()
    optimizer.step()
    accuracy = cal_accuracy(output[idx_train], labels[idx_train])
    if epoch % 10 == 0: 
      print("Epoch {0} Loss: {1} Accuracy {2}".format(epoch, loss.item(), accuracy.item()))

def evaluate(features, adj, labels, idx_test):
    model.eval()
    adj = adj.to(device) 
    features = features.to(device) 
    labels = labels.to(device)

    with torch.no_grad():
        output = model(features, adj)
        pred = output.argmax(dim=1)
    loss = criterion(output[idx_test], labels[idx_test])
    accuracy = cal_accuracy(output[idx_test], labels[idx_test])
    print("[Test] Loss: {0} Accuracy {1}".format(loss.item(), accuracy.item()))


In [105]:
for epoch in range(100): 
  train(epoch, features, adj, labels, idx_train)
  if epoch % 30 == 0: 
    evaluate(features, adj, labels, idx_test)

Epoch 0 Loss: 0.07838965207338333 Accuracy 1.0
[Test] Loss: 1.2637449502944946 Accuracy 0.583
Epoch 10 Loss: 0.08204708993434906 Accuracy 1.0
Epoch 20 Loss: 0.06400196999311447 Accuracy 1.0
Epoch 30 Loss: 0.08669473975896835 Accuracy 1.0
[Test] Loss: 1.2572790384292603 Accuracy 0.587
Epoch 40 Loss: 0.07743391394615173 Accuracy 1.0
Epoch 50 Loss: 0.06616414338350296 Accuracy 1.0
Epoch 60 Loss: 0.07256202399730682 Accuracy 1.0
[Test] Loss: 1.2550323009490967 Accuracy 0.575
Epoch 70 Loss: 0.07253197580575943 Accuracy 1.0
Epoch 80 Loss: 0.07879054546356201 Accuracy 1.0
Epoch 90 Loss: 0.06413944810628891 Accuracy 1.0
[Test] Loss: 1.249057412147522 Accuracy 0.586
