This notebook shows how a GCN/GraphSAGE model is trained to compute Node betweenness centrality on different graphs

In [8]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        # output as multiclass target
        #return F.log_softmax(x, dim=1)
        
        # output as regression target
        return x
    
    


In [31]:
import networkx as nx
import time
import torch
from torch_geometric.data import DataLoader
import importlib
import torch
from torch_geometric.data import DataLoader
import networkx as nx
from torch_geometric.data import Data
import pickle

def loadDataset(collection, name=None):
    try:
        # import datasets
        themodule = importlib.import_module("torch_geometric.datasets")
        # get the function corresponding to collection
        method_to_call = getattr(themodule, collection)
        if name:
            return method_to_call(root='./data/'+str(collection), name=name)
        else:
            return method_to_call(root='./data/'+str(collection)) 
    except:
        # custom module
        method_to_call = globals()[collection]
       
        if name:
            return method_to_call(root='./data/'+str(collection), name=name)
        else:
            return method_to_call(root='./data/'+str(collection)) 
        


def shuffleTrainTestMasks(data, trainpct = 0.7):
    ysize = list(data.y.size())[0]
    data.train_mask = torch.zeros(ysize,1, dtype=torch.long)
    data.train_mask[int(ysize*trainpct):] = 1
    data.train_mask = data.train_mask[torch.randperm(ysize)]
    data.test_mask = torch.ones(ysize,1, dtype=torch.long) - data.train_mask
    
    
def shuffleTrainTestValMasks(data, trainpct = 0.7, valpct = 0.2):

    ysize = list(data.y.size())[0]
    #print("total ", ysize)
    #print(" train ",int(ysize*trainpct)-int(ysize*trainpct*valpct))
    #print(" val ",int(ysize*trainpct*valpct))
    #print(" test ",int(ysize*(1- trainpct) ))
    data.train_mask = torch.zeros(ysize,1, dtype=torch.long)
    data.train_mask[:int(ysize*trainpct)] = 1
    data.train_mask = data.train_mask[torch.randperm(ysize)]
    #print(" train sum ",data.train_mask.sum())
    data.test_mask = torch.ones(ysize,1, dtype=torch.long) - data.train_mask
    #print(" test sum ",data.test_mask.sum())
    
    # quick and dirt
    # set first ysize*trainpct*valpct to 0, for those that are 1
    data.val_mask = torch.zeros(ysize,1, dtype=torch.long)
    #print(" val sum ",data.val_mask.sum())
    data.val_mask[:int(ysize*trainpct*valpct)] = 1
    #print(" val sum ",data.val_mask.sum())
    data.val_mask = data.val_mask[torch.randperm(ysize)]
    #print(" val sum ",data.val_mask.sum())
    data.val_mask = data.val_mask - data.test_mask
    #print(" val sum ",data.val_mask.sum())
    data.val_mask[data.val_mask <= 0 ]= 0
    #print(" val sum ",data.val_mask.sum())

    while data.val_mask.sum() < int(ysize*trainpct*valpct):
        data.val_mask = torch.zeros(ysize,1, dtype=torch.long)
        #print(" val sum ",data.val_mask.sum())
        data.val_mask[:int(ysize*trainpct*valpct)] = 1
        #print(" val sum ",data.val_mask.sum())
        data.val_mask = data.val_mask[torch.randperm(ysize)]
        #print(" val sum ",data.val_mask.sum())
        data.val_mask = data.val_mask - data.test_mask
        #print(" val sum ",data.val_mask.sum())
        data.val_mask[data.val_mask <= 0 ]= 0
        #print(" val sum ",data.val_mask.sum())
    
        
    #print("final val sum ",data.val_mask.sum())
    data.train_mask = data.train_mask - data.val_mask
    #print("final train sum ",data.train_mask.sum())
    
    #print(data.train_mask)
    print(data.train_mask.sum())
    #print(data.val_mask)
    print(data.val_mask.sum())
    #print(data.test_mask)  
    print(data.test_mask.sum())
    

def trainTestEval(dataset, iterations=1, batch_size=32):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    i = 0
    print(loader)
    print(dir(loader))
    
    G = dataset.data
    print(G)
    start = time.time()


    # 1.  prepare model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #print("using ",device)
    model = Net().to(device)
    data = G.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    model.train()

    # 2.  create a train_mask, and a test_mask (val_mask for further experiments)
    #shuffleTrainTestMasks(data)
    shuffleTrainTestValMasks(data)

    # 3. train some epochs
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = F.mse_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        if epoch % 25 == 0 :
            print("epoch-loss: ",epoch, loss)

    # 4. Model evaluation
    model.eval()
    #  classification in a multiclass setting
    #_, pred = model(data).max(dim=1)
    #correct = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
    #acc = correct / data.test_mask.sum().item()
    #print('Accuracy: {:.4f}'.format(acc))


    # regression 
    pred = model(data)
    print("target: ",data.y[data.test_mask])
    print("prediction: ",pred[data.test_mask])
    #print(pred[data.test_mask].type())
    #print(data.y[data.test_mask].type())
    # prepare the normalized mean root squared error
    t = data.y[data.test_mask]
    y = pred[data.test_mask]
    nrmse = torch.sum((t - y) ** 2)/data.test_mask.sum()
    nrmse = nrmse.sqrt()
    print("RMSE: ",nrmse)

    #m = torch.mean(t)
    #print("mean",m)
    #tmax = torch.max(t)
    #tmin = torch.min(t)
    #sd = tmax-tmin
    #print("sd",sd)
    #nrmse = (nrmse - m)/sd
    #print("NRMSE:",nrmse)


    endtime = time.time()
    print("Total train-test time: "+str(endtime-start))

    #i+=1
    #if i==1:
    #    break

In [23]:
class MyOwnDataset2():
    def __init__(self,  root, name, transform=None, pre_transform=None):
        f = open(name, 'rb')
        self.data = pickle.load(f) 
        f.close()
        
        

In [34]:
# load the dataset examples---------------------------------------

#PPI
#dataset = loadDataset('PPI')
#QM7b
#dataset = loadDataset('QM7b')
#MUTAG
#dataset = loadDataset(collection='Entities',name='MUTAG')
#ENZYMES FROM TUDataset
#dataset = loadDataset(collection='TUDataset',name='ENZYMES')
# Cora
#dataset = loadDataset(collection='Planetoid',name='Cora')

dataset = loadDataset(collection='MyOwnDataset2', name='PPI1.pickle')

trainTestEval(dataset, iterations=2, batch_size=500)

<torch_geometric.data.dataloader.DataLoader object at 0x7eff592ece48>
['_DataLoader__initialized', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'batch_sampler', 'batch_size', 'collate_fn', 'dataset', 'drop_last', 'num_workers', 'pin_memory', 'sampler', 'timeout', 'worker_init_fn']
PPI()


AttributeError: 'MyOwnDataset2' object has no attribute 'num_features'