## 1. Code base

In [19]:
import importlib
import torch
from torch_geometric.data import DataLoader
import networkx as nx
from torch_geometric.data import Data
import time
import pickle
from torch_geometric.data import InMemoryDataset
import numpy as np


      
    
def writeAdjacencyMatrixToDisk(G, filename='temp_adjacency_matrix.txt'):
    """
        Write the edges of a graph to disk as an adjacency list.

        Despite the function name, the file produced is an adjacency
        *list* (edge/node features are ignored). Each line has the format:

            source target target2 target3 ...

        A new line is started every time the source node changes while
        walking ``G.edge_index`` column by column, so a source that
        re-appears later (non-consecutively) simply gets another line.

        Args:
            G: object exposing ``edge_index``; row 0 is read as the source
               node of each edge and row 1 as the target node.
            filename: path of the text file to create or overwrite.
    """
    sources = G.edge_index[0].tolist()
    targets = G.edge_index[1].tolist()

    rows = []        # one list of string tokens per output line
    current = None   # source node of the line currently being built
    for src, dst in zip(sources, targets):
        if not rows or src != current:
            current = src
            rows.append([str(src)])
        rows[-1].append(str(dst))

    # Every row is written, including the last one, and the context
    # manager guarantees the handle is flushed and closed.
    with open(filename, 'w') as f:
        f.writelines(' '.join(tokens) + '\n' for tokens in rows)
    
    
def nx_createNxGraphInMem(G):
    """
        Build a networkx MultiGraph in memory from ``G.edge_index``.

        Every column of ``G.edge_index`` becomes one edge; node ids are
        stored as strings. Edge and node features are ignored.

        Returns:
            The populated ``nx.MultiGraph``.
    """
    multigraph = nx.MultiGraph()

    sources = G.edge_index[0].tolist()
    targets = G.edge_index[1].tolist()
    for src, dst in zip(sources, targets):
        multigraph.add_edge(str(src), str(dst))

    return multigraph
    
def nx_verifyEdges(G, g):
    """
        Report edges of ``G.edge_index`` that are missing from ``g``.

        For each edge (ni, vi), with node ids converted to strings, the
        edge counts as present when either ``(ni, vi, 0)`` or
        ``(vi, ni, 1)`` is one of the entries obtained by iterating
        ``g.edges``. A message is printed for every edge that is missing.

        Args:
            G: object exposing ``edge_index`` (row 0 sources, row 1 targets).
            g: object whose ``edges`` attribute iterates over
               ``(u, v, key)`` tuples.
               NOTE(review): the key=1 lookup for the reversed orientation
               presumably matches a MultiGraph in which each undirected
               edge was inserted twice -- confirm against the caller.
    """
    # Build the lookup once instead of re-listing g.edges for every edge.
    known_edges = set(g.edges)

    sources = G.edge_index[0].tolist()
    targets = G.edge_index[1].tolist()
    for src, dst in zip(sources, targets):
        ni, vi = str(src), str(dst)
        if (ni, vi, 0) not in known_edges and (vi, ni, 1) not in known_edges:
            print("Error {} not in networkx graph".format((ni,vi)))
            
        

def nx_compute_edge_betweenness(G):
    """
        Compute edge betweenness centrality on the first connected
        component of a networkx graph.

        Only the first component yielded by ``nx.connected_components`` is
        used; a warning is printed when the graph has more than one.
        NOTE(review): the first component is not guaranteed to be the
        largest one -- confirm whether the largest was intended.

        Args:
            G: undirected networkx graph.

        Returns:
            dict mapping each edge of that component to its betweenness;
            an empty dict when the graph has no nodes.
    """
    # connected_component_subgraphs() no longer exists in recent networkx
    # releases, so the subgraphs are built explicitly. The list is
    # materialised once because the components come from a one-shot
    # generator (listing it twice made the warning below unreachable).
    components = [G.subgraph(nodes) for nodes in nx.connected_components(G)]

    if not components:
        return {}

    # if there are more connected components...
    if len(components) > 1:
        print("WARNING connected components: ",len(components))

    return dict(nx.edge_betweenness_centrality(components[0]))

def nx_compute_node_betweenness(G):
    """
        Compute node betweenness centrality on the first connected
        component of a networkx graph.

        Only the first component yielded by ``nx.connected_components`` is
        used; a warning is printed when the graph has more than one.
        NOTE(review): the first component is not guaranteed to be the
        largest one -- confirm whether the largest was intended.

        Args:
            G: undirected networkx graph.

        Returns:
            dict mapping each node of that component to its betweenness;
            an empty dict when the graph has no nodes.
    """
    # connected_component_subgraphs() no longer exists in recent networkx
    # releases, so the subgraphs are built explicitly. The list is
    # materialised once because the components come from a one-shot
    # generator (listing it twice made the warning below unreachable).
    components = [G.subgraph(nodes) for nodes in nx.connected_components(G)]

    if not components:
        return {}

    # if there are more connected components...
    if len(components) > 1:
        print("WARNING connected components: ",len(components))

    return dict(nx.betweenness_centrality(components[0]))


def update_edge_betweenness(G, eb_dict):
    """
        FOR UNDIRECTED GRAPHS

        Store the edge betweenness of every edge of ``G`` in ``G.y``.

        ``G.y`` becomes a float tensor with one row ``[value]`` per column
        of ``G.edge_index``, in the same order. ``G.edge_attr`` is left
        untouched. ``G`` is modified in place and also returned.

        Args:
            G: object exposing ``edge_index`` (row 0 sources, row 1 targets).
            eb_dict: dict keyed by ``(source, target)`` tuples of node ids
                     given as strings. Because the graph is undirected, an
                     edge is looked up in both orientations; edges found
                     in neither get the value 0.

        Returns:
            The same ``G`` object.
    """
    sources = G.edge_index[0].tolist()
    targets = G.edge_index[1].tolist()

    new_edg_attr = []
    for src, dst in zip(sources, targets):
        # Node id 0 is a valid id: do not rely on truthiness of the ids,
        # otherwise every edge touching node 0 would silently get 0.
        ni, vi = str(src), str(dst)
        if (ni, vi) in eb_dict:
            value = eb_dict[(ni, vi)]
        else:
            value = eb_dict.get((vi, ni), 0)
        new_edg_attr.append([value])

    G.y = torch.FloatTensor(new_edg_attr)

    return G


def update_node_betweenness(G, eb_dict):
    """
        Store node betweenness centralities in ``G.y``.

        ``G.y`` becomes a 1-D float tensor of length ``max_id + 1`` where
        ``max_id`` is the largest node id found in ``eb_dict``; entry ``i``
        holds the centrality of node ``i``. Ids missing from ``eb_dict``
        get 0.0. ``G`` is modified in place and also returned.

        Args:
            G: graph object receiving the ``y`` attribute.
            eb_dict: dict mapping node ids (strings convertible to int) to
                     their betweenness centrality.

        Returns:
            The same ``G`` object.
    """
    if not eb_dict:
        # Nothing to report: keep G.y a (empty) tensor instead of crashing.
        G.y = torch.FloatTensor([])
        return G

    highest_id = max(int(key) for key in eb_dict)

    # Entry i must describe node i (ids start at 0); looking up node i+1
    # would shift every value by one and never read node "0".
    betweennesses = [eb_dict.get(str(node), 0.0)
                     for node in range(highest_id + 1)]

    G.y = torch.FloatTensor(betweennesses)
    return G

def get_betweenness_into_dict(G):
    """
        FOR UNDIRECTED GRAPHS

        Collect the per-edge values stored in ``G.y`` into a dict.

        Args:
            G: object exposing ``edge_index`` (row 0 sources, row 1
               targets) and ``y``, where ``G.y[i]`` holds the single value
               of the i-th edge.

        Returns:
            dict mapping ``(source, target)`` tuples of node ids (as
            strings) to the float stored in ``G.y`` for that edge. When
            the same pair occurs several times the last occurrence wins.
    """
    sources = G.edge_index[0].tolist()
    targets = G.edge_index[1].tolist()

    eb_dict = {}
    for i, (src, dst) in enumerate(zip(sources, targets)):
        # Node id 0 is valid, so no truthiness filtering on the ids:
        # edges touching node 0 must be kept.
        eb_dict[(str(src), str(dst))] = float(G.y[i].item())
    return eb_dict


def pyTorchGeometricDatasetToNx(G,suffix=0):
    """
        Convert a PyTorch Geometric graph into a networkx graph by going
        through an adjacency-list file on disk.

        The edges of ``G`` are written to ``temp_aj_m<suffix>.txt`` in the
        current directory and read back with ``nx.read_adjlist``. The file
        is left on disk.

        Args:
            G: graph object accepted by ``writeAdjacencyMatrixToDisk``.
            suffix: value appended to the temporary file name.

        Returns:
            The networkx graph loaded from the file.
    """
    adjacency_path = 'temp_aj_m' + str(suffix) + '.txt'

    # dump the edges, then let networkx parse them back
    writeAdjacencyMatrixToDisk(G, filename=adjacency_path)
    return nx.read_adjlist(adjacency_path)

def computeBetweenness(G,suffix=0):
    """
        Compute node betweenness for ``G`` with networkx and store it in
        ``G.y``.

        Steps: write the edges of ``G`` to ``temp_aj_m<suffix>.txt``, load
        that file as a networkx graph, compute node betweenness on it and
        write the values back through ``update_node_betweenness``.

        Args:
            G: PyTorch Geometric graph object; modified in place.
            suffix: value appended to the temporary file name.

        Returns:
            None.
    """
    adjacency_path = 'temp_aj_m' + str(suffix) + '.txt'

    # PyTorch Geometric graph -> adjacency list on disk -> networkx graph
    writeAdjacencyMatrixToDisk(G, filename=adjacency_path)
    nx_graph = nx.read_adjlist(adjacency_path)

    # node betweenness, keyed by node id
    node_scores = nx_compute_node_betweenness(nx_graph)

    # push the scores back onto the PyTorch Geometric graph (in place)
    update_node_betweenness(G, node_scores)
    


class MyOwnDataset(InMemoryDataset):
    """
        Dataset holding a single graph object loaded from a pickle file.

        NOTE: ``InMemoryDataset.__init__`` is not called by this
        constructor, so ``root``, ``transform`` and ``pre_transform`` are
        accepted but ignored, and ``download()``/``process()`` are not
        triggered from here.
    """

    def __init__(self, root, name, transform=None, pre_transform=None):
        """
            Args:
                root: unused (kept for interface compatibility).
                name: path of the pickle file containing the graph.
                transform: unused.
                pre_transform: unused.
        """
        # process() reads the pickle path through self.name, so it has to
        # be stored on the instance.
        self.name = name

        # SECURITY: pickle.load can execute arbitrary code; only load
        # files from a trusted source.
        with open(name, 'rb') as f:
            self.data = pickle.load(f)

    @property
    def raw_file_names(self):
        return ['PPI0.pickle']

    @property
    def processed_file_names(self):
        return ['PPI0']

    def download(self):
        # Download to `self.raw_dir`.
        return True

    def process(self):
        """
            Unpickle the graph stored at ``self.name`` and save it with
            ``torch.save`` to the first processed path.

            NOTE(review): ``self.processed_paths`` presumably relies on
            state set up by ``InMemoryDataset.__init__``, which this class
            does not call -- confirm before relying on this method.
        """
        # unpickle the graph
        print("going to unpickle")
        # SECURITY: see the note in __init__ about pickle.load.
        with open(self.name, 'rb') as f:
            G = pickle.load(f)

        torch.save(G, self.processed_paths[0])
        
        
class MyOwnDataset2():
    """
        Minimal container exposing a pickled object as ``self.data``.
    """

    def __init__(self,  name, transform=None, pre_transform=None):
        """
            Args:
                name: path of the pickle file to load.
                transform: unused (kept for interface compatibility).
                pre_transform: unused (kept for interface compatibility).
        """
        # SECURITY: pickle.load can execute arbitrary code; only load
        # files from a trusted source.
        # The context manager closes the file even if unpickling fails.
        with open(name, 'rb') as f:
            self.data = pickle.load(f)
    
def loadDataset(collection, name=None, split=None):
    """
        Instantiate a dataset class from ``torch_geometric.datasets``.

        Args:
            collection: name of the class to look up in the module; also
                        used as sub-directory of ``./data/`` for ``root``.
            name: forwarded as ``name=`` when truthy.
            split: forwarded as ``split=`` when ``name`` is falsy and
                   ``split`` is truthy.

        Returns:
            The object returned by calling the looked-up class.
    """
    datasets_module = importlib.import_module("torch_geometric.datasets")
    dataset_factory = getattr(datasets_module, collection)

    # `name` takes precedence over `split`; at most one of them is passed
    arguments = {'root': './data/'+str(collection)}
    if name:
        arguments['name'] = name
    elif split:
        arguments['split'] = split

    return dataset_factory(**arguments)

    
def createDataset(x, edge_index):
    """
        Wrap node features and an edge index into a ``Data`` object.

        Args:
            x: passed to ``Data`` as ``x``.
            edge_index: passed to ``Data`` as ``edge_index``.
    """
    graph = Data(x=x, edge_index=edge_index)
    return graph
    

def createDatasetFromNX(g, undirected=True):
    """
        Build a dataset (via ``createDataset``) from a networkx graph.

        Node ids are converted with ``int`` and used in the edge index.
        Every node receives the single feature 1.0.

        Args:
            g: graph exposing ``edges`` and ``nodes()``.
            undirected: when true, each edge (u, v) is immediately followed
                        by its reverse (v, u) in the edge index.
    """
    sources = []
    targets = []
    for edge in g.edges:
        # node id must be an int
        u = int(edge[0])
        v = int(edge[1])
        sources.append(u)
        targets.append(v)
        if undirected:
            sources.append(v)
            targets.append(u)

    edge_index = torch.tensor([sources, targets], dtype=torch.long)

    # one constant feature per node
    num_nodes = len(g.nodes())
    features = torch.tensor([[1.0]] * num_nodes, dtype=torch.float)

    return createDataset(features, edge_index)
                         
def createDatasetFromNXwithTarget(g,y, undirected=True):
    """
        Same as ``createDatasetFromNX`` but also attaches the target ``y``.

        Args:
            g: networkx graph forwarded to ``createDatasetFromNX``.
            y: sequence of numbers, stored as a float tensor in ``.y``.
            undirected: forwarded to ``createDatasetFromNX``.
    """
    graph_data = createDatasetFromNX(g, undirected)
    graph_data.y = torch.FloatTensor(y)
    return graph_data
                   

## 2. Random Graph datasets

### Tests

In [2]:
# Smoke test for the conversion pipeline: build a random graph, compute its
# betweenness with networkx, wrap it as a PyTorch Geometric dataset and
# pickle the result.

# generate random graphs (only `er` is used below; ws, ba and red are
# created but not used in this cell)
er = nx.erdos_renyi_graph(100, 0.15)
ws = nx.watts_strogatz_graph(30, 3, 0.1)
ba = nx.barabasi_albert_graph(100, 5)
red = nx.random_lobster(100, 0.9, 0.9)
g=er


# compute its node and edge betweenness
# (the edge betweenness is computed but not used in this cell)
nx_betweenness = nx.betweenness_centrality(g)
nx_edge_betweenness = nx.edge_betweenness_centrality(g)
# y holds one node-betweenness value per node, in the dict's iteration order
y = [ v for k,v in nx_betweenness.items()]

# verify that the order of the betweenness values matches the node order
#print(nx_betweenness)
#print(y)
#-> ok!

# translate into a PyTorch Geometric dataset 
dataset = createDatasetFromNXwithTarget(g,y)

# check dimensions
print(dataset) # y holds node values here: compare its length with the first dimension of x


# save as a pickled object (written to "<dname>_<i>.pickle" in the current directory)
dname = "er_100_0_15"
i=0
with open(dname+"_"+str(i)+'.pickle','wb') as f:
    pickle.dump(dataset,f)



Data(edge_index=[2, 740], x=[100, 1], y=[100])


### Implementation

In [13]:

def createRandomGraphDataset(g,dname,betweenness):
    """
        Build a PyTorch Geometric dataset whose target ``y`` is the
        betweenness centrality of ``g`` and pickle it to ``<dname>.pickle``.

        Args:
            g: networkx graph whose node ids are convertible to ``int``.
            dname: output file name without the ``.pickle`` extension.
            betweenness: ``'node'`` for node betweenness; any other value
                         selects edge betweenness.

        Side effects:
            Prints a few diagnostics about the dataset and writes the
            pickle file.
    """
    if betweenness == 'node':
        nx_betweenness = nx.betweenness_centrality(g)
        # The dataset indexes nodes by their integer id, so the targets
        # must be ordered by id rather than by the graph's node insertion
        # order (which differs e.g. for graphs read from an adjacency list).
        y = [nx_betweenness[node] for node in sorted(nx_betweenness, key=int)]
    else:
        nx_edge_betweenness = nx.edge_betweenness_centrality(g)
        # createDatasetFromNXwithTarget emits every undirected edge as two
        # consecutive directed edges, hence each value is repeated twice.
        # NOTE(review): assumes the centrality dict iterates in the same
        # order as g.edges -- confirm.
        y = []
        for value in nx_edge_betweenness.values():
            y.append(value)
            y.append(value)

    # translate into a PyTorch Geometric dataset 
    dataset = createDatasetFromNXwithTarget(g,y)
    print(dataset.num_features)

    # check dimensions
    print(dataset) # y has one entry per node ('node') or per directed edge (otherwise)
    print(" directed graph: ",dataset.is_directed())
    print("isolated nodes: ",dataset.contains_isolated_nodes())
    print("self loops: ",dataset.contains_self_loops())

    # save as a pickled object
    with open(dname+'.pickle','wb') as f:
        pickle.dump(dataset,f)

In [14]:
# Generate one random graph per family; only `er` is used in this cell
# (ws, ba and red are created but not passed on).
er = nx.erdos_renyi_graph(100, 0.15)
ws = nx.watts_strogatz_graph(30, 3, 0.1)
ba = nx.barabasi_albert_graph(100, 5)
red = nx.random_lobster(100, 0.9, 0.9)


# node-betweenness dataset, pickled to 'er_100_0_15_nb.pickle'
createRandomGraphDataset(er,'er_100_0_15_nb','node')



1
Data(edge_index=[2, 1516], x=[100, 1], y=[100])
 directed graph:  False
isolated nodes:  False
self loops:  False


In [20]:
# Node-betweenness datasets ('_nb' suffix). The second argument of each
# call is the output pickle name (without extension) and encodes the
# generator and its parameters.

# Erdos-Renyi graphs
er = nx.erdos_renyi_graph(100, 0.15)
createRandomGraphDataset(er,'er_100_0_15_nb','node')
er = nx.erdos_renyi_graph(100, 0.45)
createRandomGraphDataset(er,'er_100_0_45_nb','node')
er = nx.erdos_renyi_graph(1000, 0.15)
createRandomGraphDataset(er,'er_1000_0_15_nb','node')
er = nx.erdos_renyi_graph(1000, 0.45)
createRandomGraphDataset(er,'er_1000_0_45_nb','node')


# Watts-Strogatz graphs
ws = nx.watts_strogatz_graph(30, 3, 0.1)
createRandomGraphDataset(ws,'ws_30_3_0_1_nb','node')
ws = nx.watts_strogatz_graph(100, 3, 0.1)
createRandomGraphDataset(ws,'ws_100_3_0_1_nb','node')
ws = nx.watts_strogatz_graph(1000, 3, 0.1)
createRandomGraphDataset(ws,'ws_1000_3_0_1_nb','node')


ws = nx.watts_strogatz_graph(1000, 10, 0.1)
createRandomGraphDataset(ws,'ws_1000_10_0_1_nb','node')


# Barabasi-Albert graphs
ba = nx.barabasi_albert_graph(100, 5)
createRandomGraphDataset(ba,'ba_100_5_nb','node')
ba = nx.barabasi_albert_graph(1000, 5)
createRandomGraphDataset(ba,'ba_1000_5_nb','node')



#er = nx.erdos_renyi_graph(4000, 0.15)
#createRandomGraphDataset(er,'er_4000_0_15_nb','node')
#er = nx.erdos_renyi_graph(4000, 0.35)
#createRandomGraphDataset(er,'er_4000_0_35_nb','node')
#ws = nx.watts_strogatz_graph(4000, 3, 0.1)
#createRandomGraphDataset(ws,'ws_4000_3_0_1_nb','node')
#ws = nx.watts_strogatz_graph(4000, 20, 0.1)
#createRandomGraphDataset(ws,'ws_4000_20_0_1_nb','node')
#ba = nx.barabasi_albert_graph(4000, 5)
#createRandomGraphDataset(ba,'ba_4000_5_nb','node')


1
Data(edge_index=[2, 1542], x=[100, 1], y=[100])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 4432], x=[100, 1], y=[100])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 149596], x=[1000, 1], y=[1000])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 449110], x=[1000, 1], y=[1000])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 60], x=[30, 1], y=[30])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 200], x=[100, 1], y=[100])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 2000], x=[1000, 1], y=[1000])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 10000], x=[1000, 1], y=[1000])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 950], x=[100, 1], y=[100])
 directed graph

KeyboardInterrupt: 

In [21]:
# repeat with edge betweenness
er = nx.erdos_renyi_graph(100, 0.15)
createRandomGraphDataset(er,'er_100_0_15_eb','edge')
er = nx.erdos_renyi_graph(100, 0.45)
createRandomGraphDataset(er,'er_100_0_45_eb','edge')
er = nx.erdos_renyi_graph(1000, 0.15)
createRandomGraphDataset(er,'er_1000_0_15_eb','edge')
er = nx.erdos_renyi_graph(1000, 0.45)
createRandomGraphDataset(er,'er_1000_0_45_eb','edge')


ws = nx.watts_strogatz_graph(30, 3, 0.1)
createRandomGraphDataset(ws,'ws_30_3_0_1_eb','edge')
ws = nx.watts_strogatz_graph(100, 3, 0.1)
createRandomGraphDataset(ws,'ws_100_3_0_1_eb','edge')
ws = nx.watts_strogatz_graph(1000, 3, 0.1)
createRandomGraphDataset(ws,'ws_1000_3_0_1_eb','edge')


ws = nx.watts_strogatz_graph(1000, 10, 0.1)
createRandomGraphDataset(ws,'ws_1000_10_0_1_eb','edge')


ba = nx.barabasi_albert_graph(100, 5)
createRandomGraphDataset(ba,'ba_100_5_eb','edge')
ba = nx.barabasi_albert_graph(1000, 5)
createRandomGraphDataset(ba,'ba_1000_5_eb','edge')


#er = nx.erdos_renyi_graph(4000, 0.15)
#createRandomGraphDataset(er,'er_4000_0_15_eb','edge')
#er = nx.erdos_renyi_graph(4000, 0.35)
#createRandomGraphDataset(er,'er_4000_0_35_eb','edge')
#ws = nx.watts_strogatz_graph(4000, 3, 0.1)
#createRandomGraphDataset(ws,'ws_4000_3_0_1_eb','edge')
#ws = nx.watts_strogatz_graph(4000, 20, 0.1)
#createRandomGraphDataset(ws,'ws_4000_20_0_1_eb','edge')
#ba = nx.barabasi_albert_graph(4000, 5)
#createRandomGraphDataset(ba,'ba_4000_5_eb','edge')

1
Data(edge_index=[2, 1386], x=[100, 1], y=[693])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 4534], x=[100, 1], y=[2267])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 149652], x=[1000, 1], y=[74826])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 449736], x=[1000, 1], y=[224868])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 60], x=[30, 1], y=[30])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 200], x=[100, 1], y=[100])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 2000], x=[1000, 1], y=[1000])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 10000], x=[1000, 1], y=[5000])
 directed graph:  False
isolated nodes:  False
self loops:  False
1
Data(edge_index=[2, 950], x=[100, 1], y=[475])
 directed g

## 3. Benchmark datasets

In [18]:

# plan
# 1. get graph from PyTorch Geom
# 2. transform to nx (pyTorchGeometricDatasetToNx)
# 3. call createRandomGraphDataset(ba,'ba_1000_5','edge') 
#      or createRandomGraphDataset(ba,'ba_1000_5','node')


def processDatasetsSingle(dname, dataset):
    """
        Process a dataset that holds a single graph in ``dataset.data``.

        The graph is converted to networkx and two pickles are written:
        ``<dname>_1_eb.pickle`` (edge betweenness) and
        ``<dname>_1_nd.pickle`` (node betweenness), matching what
        ``processDatasets`` writes for its first graph.

        Args:
            dname: prefix of the output file names.
            dataset: object exposing the graph as ``.data``.
    """
    G = dataset.data
    print(G)
    # The counter must be initialised locally: it is incremented below, so
    # Python treats it as a local name and reading it first would raise
    # UnboundLocalError.
    i = 0
    g = pyTorchGeometricDatasetToNx(G,i)
    i+=1
    print(i)
    createRandomGraphDataset(g,dname+'_'+str(i)+'_eb','edge') 
    createRandomGraphDataset(g,dname+'_'+str(i)+'_nd','node') 

        
def processDatasets(dname, dataset):
    """
        Convert every graph yielded by a ``DataLoader`` over ``dataset``
        into an edge-betweenness and a node-betweenness pickle.

        For the k-th graph (k starting at 1) the outputs are named
        ``<dname>_<k>_eb`` and ``<dname>_<k>_nd``; the temporary
        adjacency file uses the suffix k-1.

        Args:
            dname: prefix of the output file names.
            dataset: dataset handed to ``DataLoader`` (no batch size is
                     passed, so the loader's default applies).
    """
    loader = DataLoader(dataset, shuffle=False)
    for position, batch in enumerate(loader):
        print(batch)
        nx_graph = pyTorchGeometricDatasetToNx(batch, position)
        ordinal = position + 1
        print(ordinal)
        stem = dname+'_'+str(ordinal)
        createRandomGraphDataset(nx_graph, stem+'_eb', 'edge')
        createRandomGraphDataset(nx_graph, stem+'_nd', 'node')


# Each benchmark is loaded through loadDataset and converted by
# processDatasets. The first argument of processDatasets is the prefix of
# the pickle files it writes, so it must be unique per benchmark or the
# outputs overwrite each other.

#KarateClub
dname='KarateClub'
dataset = loadDataset(dname)
processDatasets(dname, dataset)

#ENZYMES FROM TUDataset
dname='TUDataset'
name='ENZYMES'
dataset = loadDataset(dname,name)
# include the dataset name in the prefix: ENZYMES and PROTEINS share the
# same collection and would otherwise write to the same files
processDatasets(dname+'_'+name, dataset)

#PROTEINS FROM TUDataset
dname='TUDataset'
name='PROTEINS'
dataset = loadDataset(dname,name)
processDatasets(dname+'_'+name, dataset)

#PPI
dname='PPI'
dataset = loadDataset(dname)
processDatasets(dname,dataset)

#QM7b
# dname has to be updated here as well, otherwise the QM7b outputs are
# written under the 'PPI' prefix and overwrite the PPI pickles
dname='QM7b'
dataset = loadDataset(dname)
processDatasets(dname,dataset)

#---------------------------------------------------------------------
#MUTAG
#dataset = loadDataset(collection='Entities',name='MUTAG')

# Cora
#dataset = loadDataset(collection='Planetoid',name='Cora')
#processDatasets(dname,dataset)



        
        


Batch(batch=[56944], edge_index=[2, 818716], test_mask=[56944], train_mask=[56944], val_mask=[56944], x=[56944, 50], y=[56944, 121])


KeyboardInterrupt: 