In [64]:
!pip install dgl
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp



In [None]:
!nvcc --version

In [65]:
import pandas as pd
nodes = pd.read_csv('./node.csv')#node URLs
labels = pd.read_csv('./label.csv')#corresponding labels
feat = pd.read_csv('./feat.csv')#corresponding features
edges = pd.read_csv('./edges.csv')#edges in the form source, destination
src=list(edges['src'])
dst=list(edges[' dst'])


In [66]:
#loading unique labels for mapping them to int ids
labelsm = pd.read_csv('./labels.csv')

In [67]:
#mapping node urls to int ids
mapping={}
for i in range(len(nodes['node'])):
  mapping[nodes['node'][i]]=i
for i in range(len(src)):
  src[i]=mapping[src[i].strip()]
for i in range(len(dst)):
  dst[i]=mapping[dst[i].strip()]

In [68]:
#mapping labels to int ids
mappingl={}
for i in range(len(labelsm['label'])):
  mappingl[labelsm['label'][i]]=i
for i in range(len(labels['label'])):
  labels['label'][i]=mappingl[labels['label'][i].strip()]

In [69]:
u = np.concatenate([src, dst])
v = np.concatenate([dst, src])
g = dgl.graph((u,v), num_nodes=len(nodes))#undirected graph

In [70]:
#feature adjacency matrix
from numpy import genfromtxt
feat = genfromtxt('feat.csv', delimiter=',')
g.ndata['feat']=torch.from_numpy(feat)
g.ndata['feat']=g.ndata['feat'].type(torch.FloatTensor)

In [71]:
#labels
g.ndata['label'] = torch.from_numpy(labels['label'].astype('category').cat.codes.to_numpy()).type(torch.LongTensor)

In [72]:
#train, validation, test split = 90%, 5%, 5%
n_nodes = len(nodes)
n_train = int(n_nodes* 0.9)
n_val = int(n_nodes* 0.05)
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
val_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask = torch.zeros(n_nodes, dtype=torch.bool)
train_mask[:n_train] = True
val_mask[n_train:n_train + n_val] = True
test_mask[n_train + n_val:] = True
g.ndata['train_mask'] = train_mask
g.ndata['val_mask'] = val_mask
g.ndata['test_mask'] = test_mask

In [73]:
print('Node features')
print(g.ndata)
print('Edge features')
print(g.edata)

Node features
{'feat': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]]), 'label': tensor([0, 0, 0,  ..., 5, 5, 5]), 'train_mask': tensor([ True,  True,  True,  ..., False, False, False]), 'val_mask': tensor([False, False, False,  ..., False, False, False]), 'test_mask': tensor([False, False, False,  ...,  True,  True,  True])}
Edge features
{}


In [74]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats,allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, num_classes,allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

#creating model
model = GCN(g.ndata['feat'].shape[1], 16, 6)

In [75]:
def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    best_val_acc = 0
    best_test_acc = 0

    features = (g.ndata['feat'])
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    for e in range(20):
        #forward
        logits = model(g, features)

        #computing prediction
        pred = logits.argmax(1)

        #computing losses of the nodes in the training set
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        #computing accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        #saving the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        #backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc))
model = GCN(g.ndata['feat'].shape[1], 16, 6)
#training
train(g, model)

In epoch 0, loss: 1.804, val acc: 0.031 (best 0.031), test acc: 0.076 (best 0.076)
In epoch 1, loss: 1.742, val acc: 0.188 (best 0.188), test acc: 0.273 (best 0.273)
In epoch 2, loss: 1.682, val acc: 0.797 (best 0.797), test acc: 0.803 (best 0.803)
In epoch 3, loss: 1.623, val acc: 0.797 (best 0.797), test acc: 0.803 (best 0.803)
In epoch 4, loss: 1.564, val acc: 0.797 (best 0.797), test acc: 0.803 (best 0.803)
In epoch 5, loss: 1.504, val acc: 0.812 (best 0.812), test acc: 0.758 (best 0.758)
In epoch 6, loss: 1.448, val acc: 0.812 (best 0.812), test acc: 0.758 (best 0.758)
In epoch 7, loss: 1.399, val acc: 0.812 (best 0.812), test acc: 0.758 (best 0.758)
In epoch 8, loss: 1.360, val acc: 0.812 (best 0.812), test acc: 0.758 (best 0.758)
In epoch 9, loss: 1.331, val acc: 0.812 (best 0.812), test acc: 0.758 (best 0.758)
In epoch 10, loss: 1.310, val acc: 0.797 (best 0.812), test acc: 0.803 (best 0.758)
In epoch 11, loss: 1.295, val acc: 0.797 (best 0.812), test acc: 0.803 (best 0.758)
In

In [76]:
#saving parameters
torch.save(model, "./demo.pth")